1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/jsregexp.h"
6
7 #include <memory>
8
9 #include "src/base/platform/platform.h"
10 #include "src/compilation-cache.h"
11 #include "src/elements.h"
12 #include "src/execution.h"
13 #include "src/factory.h"
14 #include "src/isolate-inl.h"
15 #include "src/messages.h"
16 #include "src/ostreams.h"
17 #include "src/regexp/interpreter-irregexp.h"
18 #include "src/regexp/jsregexp-inl.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-macro-assembler.h"
22 #include "src/regexp/regexp-parser.h"
23 #include "src/regexp/regexp-stack.h"
24 #include "src/runtime/runtime.h"
25 #include "src/splay-tree-inl.h"
26 #include "src/string-search.h"
27 #include "src/unicode-decoder.h"
28
29 #ifdef V8_I18N_SUPPORT
30 #include "unicode/uniset.h"
31 #include "unicode/utypes.h"
32 #endif // V8_I18N_SUPPORT
33
34 #ifndef V8_INTERPRETED_REGEXP
35 #if V8_TARGET_ARCH_IA32
36 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
37 #elif V8_TARGET_ARCH_X64
38 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
39 #elif V8_TARGET_ARCH_ARM64
40 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
41 #elif V8_TARGET_ARCH_ARM
42 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
43 #elif V8_TARGET_ARCH_PPC
44 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
45 #elif V8_TARGET_ARCH_S390
46 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
47 #elif V8_TARGET_ARCH_MIPS
48 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
49 #elif V8_TARGET_ARCH_MIPS64
50 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
51 #elif V8_TARGET_ARCH_X87
52 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
53 #else
54 #error Unsupported target architecture.
55 #endif
56 #endif
57
58
59 namespace v8 {
60 namespace internal {
61
62 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)63 static inline MaybeHandle<Object> ThrowRegExpException(
64 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65 Isolate* isolate = re->GetIsolate();
66 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 pattern, error_text),
68 Object);
69 }
70
71
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> error_text)72 inline void ThrowRegExpException(Handle<JSRegExp> re,
73 Handle<String> error_text) {
74 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75 }
76
77
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)78 ContainedInLattice AddRange(ContainedInLattice containment,
79 const int* ranges,
80 int ranges_length,
81 Interval new_range) {
82 DCHECK((ranges_length & 1) == 1);
83 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
84 if (containment == kLatticeUnknown) return containment;
85 bool inside = false;
86 int last = 0;
87 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 // Consider the range from last to ranges[i].
89 // We haven't got to the new range yet.
90 if (ranges[i] <= new_range.from()) continue;
91 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 // inclusive, but the values in ranges are not.
93 if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 }
96 return kLatticeUnknown;
97 }
98 return containment;
99 }
100
101
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107
108
109 // Identifies the sort of regexps where the regexp engine is faster
110 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)111 static bool HasFewDifferentCharacters(Handle<String> pattern) {
112 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
113 if (length <= kPatternTooShortForBoyerMoore) return false;
114 const int kMod = 128;
115 bool character_found[kMod];
116 int different = 0;
117 memset(&character_found[0], 0, sizeof(character_found));
118 for (int i = 0; i < length; i++) {
119 int ch = (pattern->Get(i) & (kMod - 1));
120 if (!character_found[ch]) {
121 character_found[ch] = true;
122 different++;
123 // We declare a regexp low-alphabet if it has at least 3 times as many
124 // characters as it has different characters.
125 if (different * 3 > length) return false;
126 }
127 }
128 return true;
129 }
130
131
132 // Generic RegExp methods. Dispatches to implementation specific methods.
133
134
Compile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)135 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
136 Handle<String> pattern,
137 JSRegExp::Flags flags) {
138 Isolate* isolate = re->GetIsolate();
139 Zone zone(isolate->allocator(), ZONE_NAME);
140 CompilationCache* compilation_cache = isolate->compilation_cache();
141 MaybeHandle<FixedArray> maybe_cached =
142 compilation_cache->LookupRegExp(pattern, flags);
143 Handle<FixedArray> cached;
144 if (maybe_cached.ToHandle(&cached)) {
145 re->set_data(*cached);
146 return re;
147 }
148 pattern = String::Flatten(pattern);
149 PostponeInterruptsScope postpone(isolate);
150 RegExpCompileData parse_result;
151 FlatStringReader reader(isolate, pattern);
152 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
153 &parse_result)) {
154 // Throw an exception if we fail to parse the pattern.
155 return ThrowRegExpException(re, pattern, parse_result.error);
156 }
157
158 bool has_been_compiled = false;
159
160 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
161 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
162 // Parse-tree is a single atom that is equal to the pattern.
163 AtomCompile(re, pattern, flags, pattern);
164 has_been_compiled = true;
165 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
166 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
167 RegExpAtom* atom = parse_result.tree->AsAtom();
168 Vector<const uc16> atom_pattern = atom->data();
169 Handle<String> atom_string;
170 ASSIGN_RETURN_ON_EXCEPTION(
171 isolate, atom_string,
172 isolate->factory()->NewStringFromTwoByte(atom_pattern),
173 Object);
174 if (!HasFewDifferentCharacters(atom_string)) {
175 AtomCompile(re, pattern, flags, atom_string);
176 has_been_compiled = true;
177 }
178 }
179 if (!has_been_compiled) {
180 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
181 }
182 DCHECK(re->data()->IsFixedArray());
183 // Compilation succeeded so the data is set on the regexp
184 // and we can store it in the cache.
185 Handle<FixedArray> data(FixedArray::cast(re->data()));
186 compilation_cache->PutRegExp(pattern, flags, data);
187
188 return re;
189 }
190
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)191 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
192 Handle<String> subject, int index,
193 Handle<RegExpMatchInfo> last_match_info) {
194 switch (regexp->TypeTag()) {
195 case JSRegExp::ATOM:
196 return AtomExec(regexp, subject, index, last_match_info);
197 case JSRegExp::IRREGEXP: {
198 return IrregexpExec(regexp, subject, index, last_match_info);
199 }
200 default:
201 UNREACHABLE();
202 return MaybeHandle<Object>();
203 }
204 }
205
206
207 // RegExp Atom implementation: Simple string search using indexOf.
208
209
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)210 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
211 Handle<String> pattern,
212 JSRegExp::Flags flags,
213 Handle<String> match_pattern) {
214 re->GetIsolate()->factory()->SetRegExpAtomData(re,
215 JSRegExp::ATOM,
216 pattern,
217 flags,
218 match_pattern);
219 }
220
SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,String * subject,int from,int to)221 static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
222 String* subject, int from, int to) {
223 SealHandleScope shs(last_match_info->GetIsolate());
224 last_match_info->SetNumberOfCaptureRegisters(2);
225 last_match_info->SetLastSubject(subject);
226 last_match_info->SetLastInput(subject);
227 last_match_info->SetCapture(0, from);
228 last_match_info->SetCapture(1, to);
229 }
230
231
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)232 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
233 Handle<String> subject,
234 int index,
235 int32_t* output,
236 int output_size) {
237 Isolate* isolate = regexp->GetIsolate();
238
239 DCHECK(0 <= index);
240 DCHECK(index <= subject->length());
241
242 subject = String::Flatten(subject);
243 DisallowHeapAllocation no_gc; // ensure vectors stay valid
244
245 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
246 int needle_len = needle->length();
247 DCHECK(needle->IsFlat());
248 DCHECK_LT(0, needle_len);
249
250 if (index + needle_len > subject->length()) {
251 return RegExpImpl::RE_FAILURE;
252 }
253
254 for (int i = 0; i < output_size; i += 2) {
255 String::FlatContent needle_content = needle->GetFlatContent();
256 String::FlatContent subject_content = subject->GetFlatContent();
257 DCHECK(needle_content.IsFlat());
258 DCHECK(subject_content.IsFlat());
259 // dispatch on type of strings
260 index =
261 (needle_content.IsOneByte()
262 ? (subject_content.IsOneByte()
263 ? SearchString(isolate, subject_content.ToOneByteVector(),
264 needle_content.ToOneByteVector(), index)
265 : SearchString(isolate, subject_content.ToUC16Vector(),
266 needle_content.ToOneByteVector(), index))
267 : (subject_content.IsOneByte()
268 ? SearchString(isolate, subject_content.ToOneByteVector(),
269 needle_content.ToUC16Vector(), index)
270 : SearchString(isolate, subject_content.ToUC16Vector(),
271 needle_content.ToUC16Vector(), index)));
272 if (index == -1) {
273 return i / 2; // Return number of matches.
274 } else {
275 output[i] = index;
276 output[i+1] = index + needle_len;
277 index += needle_len;
278 }
279 }
280 return output_size / 2;
281 }
282
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)283 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
284 int index,
285 Handle<RegExpMatchInfo> last_match_info) {
286 Isolate* isolate = re->GetIsolate();
287
288 static const int kNumRegisters = 2;
289 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
290 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
291
292 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
293
294 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
295
296 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
297 SealHandleScope shs(isolate);
298 SetAtomLastCapture(last_match_info, *subject, output_registers[0],
299 output_registers[1]);
300 return last_match_info;
301 }
302
303
304 // Irregexp implementation.
305
306 // Ensures that the regexp object contains a compiled version of the
307 // source for either one-byte or two-byte subject strings.
308 // If the compiled version doesn't already exist, it is compiled
309 // from the source pattern.
310 // If compilation fails, an exception is thrown and this function
311 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)312 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
313 Handle<String> sample_subject,
314 bool is_one_byte) {
315 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
316 #ifdef V8_INTERPRETED_REGEXP
317 if (compiled_code->IsByteArray()) return true;
318 #else // V8_INTERPRETED_REGEXP (RegExp native code)
319 if (compiled_code->IsCode()) return true;
320 #endif
321 // We could potentially have marked this as flushable, but have kept
322 // a saved version if we did not flush it yet.
323 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
324 if (saved_code->IsCode()) {
325 // Reinstate the code in the original place.
326 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
327 DCHECK(compiled_code->IsSmi());
328 return true;
329 }
330 return CompileIrregexp(re, sample_subject, is_one_byte);
331 }
332
333
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)334 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
335 Handle<String> sample_subject,
336 bool is_one_byte) {
337 // Compile the RegExp.
338 Isolate* isolate = re->GetIsolate();
339 Zone zone(isolate->allocator(), ZONE_NAME);
340 PostponeInterruptsScope postpone(isolate);
341 // If we had a compilation error the last time this is saved at the
342 // saved code index.
343 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
344 // When arriving here entry can only be a smi, either representing an
345 // uncompiled regexp, a previous compilation error, or code that has
346 // been flushed.
347 DCHECK(entry->IsSmi());
348 int entry_value = Smi::cast(entry)->value();
349 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
350 entry_value == JSRegExp::kCompilationErrorValue ||
351 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
352
353 if (entry_value == JSRegExp::kCompilationErrorValue) {
354 // A previous compilation failed and threw an error which we store in
355 // the saved code index (we store the error message, not the actual
356 // error). Recreate the error object and throw it.
357 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
358 DCHECK(error_string->IsString());
359 Handle<String> error_message(String::cast(error_string));
360 ThrowRegExpException(re, error_message);
361 return false;
362 }
363
364 JSRegExp::Flags flags = re->GetFlags();
365
366 Handle<String> pattern(re->Pattern());
367 pattern = String::Flatten(pattern);
368 RegExpCompileData compile_data;
369 FlatStringReader reader(isolate, pattern);
370 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
371 &compile_data)) {
372 // Throw an exception if we fail to parse the pattern.
373 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
374 USE(ThrowRegExpException(re, pattern, compile_data.error));
375 return false;
376 }
377 RegExpEngine::CompilationResult result =
378 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
379 sample_subject, is_one_byte);
380 if (result.error_message != NULL) {
381 // Unable to compile regexp.
382 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
383 CStrVector(result.error_message)).ToHandleChecked();
384 ThrowRegExpException(re, error_message);
385 return false;
386 }
387
388 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
389 data->set(JSRegExp::code_index(is_one_byte), result.code);
390 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
391 int register_max = IrregexpMaxRegisterCount(*data);
392 if (result.num_registers > register_max) {
393 SetIrregexpMaxRegisterCount(*data, result.num_registers);
394 }
395
396 return true;
397 }
398
399
IrregexpMaxRegisterCount(FixedArray * re)400 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
401 return Smi::cast(
402 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
403 }
404
405
SetIrregexpMaxRegisterCount(FixedArray * re,int value)406 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
407 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
408 }
409
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)410 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
411 Handle<FixedArray> value) {
412 if (value.is_null()) {
413 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
414 } else {
415 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
416 }
417 }
418
IrregexpNumberOfCaptures(FixedArray * re)419 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
420 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
421 }
422
423
IrregexpNumberOfRegisters(FixedArray * re)424 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
425 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
426 }
427
428
IrregexpByteCode(FixedArray * re,bool is_one_byte)429 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
430 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
431 }
432
433
IrregexpNativeCode(FixedArray * re,bool is_one_byte)434 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
435 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
436 }
437
438
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)439 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
440 Handle<String> pattern,
441 JSRegExp::Flags flags,
442 int capture_count) {
443 // Initialize compiled code entries to null.
444 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
445 JSRegExp::IRREGEXP,
446 pattern,
447 flags,
448 capture_count);
449 }
450
451
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)452 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
453 Handle<String> subject) {
454 DCHECK(subject->IsFlat());
455
456 // Check representation of the underlying storage.
457 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
458 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
459
460 #ifdef V8_INTERPRETED_REGEXP
461 // Byte-code regexp needs space allocated for all its registers.
462 // The result captures are copied to the start of the registers array
463 // if the match succeeds. This way those registers are not clobbered
464 // when we set the last match info from last successful match.
465 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
466 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
467 #else // V8_INTERPRETED_REGEXP
468 // Native regexp only needs room to output captures. Registers are handled
469 // internally.
470 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
471 #endif // V8_INTERPRETED_REGEXP
472 }
473
474
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)475 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
476 Handle<String> subject,
477 int index,
478 int32_t* output,
479 int output_size) {
480 Isolate* isolate = regexp->GetIsolate();
481
482 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
483
484 DCHECK(index >= 0);
485 DCHECK(index <= subject->length());
486 DCHECK(subject->IsFlat());
487
488 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
489
490 #ifndef V8_INTERPRETED_REGEXP
491 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
492 do {
493 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
494 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
495 // The stack is used to allocate registers for the compiled regexp code.
496 // This means that in case of failure, the output registers array is left
497 // untouched and contains the capture results from the previous successful
498 // match. We can use that to set the last match info lazily.
499 NativeRegExpMacroAssembler::Result res =
500 NativeRegExpMacroAssembler::Match(code,
501 subject,
502 output,
503 output_size,
504 index,
505 isolate);
506 if (res != NativeRegExpMacroAssembler::RETRY) {
507 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
508 isolate->has_pending_exception());
509 STATIC_ASSERT(
510 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
511 STATIC_ASSERT(
512 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
513 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
514 == RE_EXCEPTION);
515 return static_cast<IrregexpResult>(res);
516 }
517 // If result is RETRY, the string has changed representation, and we
518 // must restart from scratch.
519 // In this case, it means we must make sure we are prepared to handle
520 // the, potentially, different subject (the string can switch between
521 // being internal and external, and even between being Latin1 and UC16,
522 // but the characters are always the same).
523 IrregexpPrepare(regexp, subject);
524 is_one_byte = subject->IsOneByteRepresentationUnderneath();
525 } while (true);
526 UNREACHABLE();
527 return RE_EXCEPTION;
528 #else // V8_INTERPRETED_REGEXP
529
530 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
531 // We must have done EnsureCompiledIrregexp, so we can get the number of
532 // registers.
533 int number_of_capture_registers =
534 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
535 int32_t* raw_output = &output[number_of_capture_registers];
536 // We do not touch the actual capture result registers until we know there
537 // has been a match so that we can use those capture results to set the
538 // last match info.
539 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
540 raw_output[i] = -1;
541 }
542 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
543 isolate);
544
545 IrregexpResult result = IrregexpInterpreter::Match(isolate,
546 byte_codes,
547 subject,
548 raw_output,
549 index);
550 if (result == RE_SUCCESS) {
551 // Copy capture results to the start of the registers array.
552 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
553 }
554 if (result == RE_EXCEPTION) {
555 DCHECK(!isolate->has_pending_exception());
556 isolate->StackOverflow();
557 }
558 return result;
559 #endif // V8_INTERPRETED_REGEXP
560 }
561
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)562 MaybeHandle<Object> RegExpImpl::IrregexpExec(
563 Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
564 Handle<RegExpMatchInfo> last_match_info) {
565 Isolate* isolate = regexp->GetIsolate();
566 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
567
568 subject = String::Flatten(subject);
569
570 // Prepare space for the return values.
571 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
572 if (FLAG_trace_regexp_bytecodes) {
573 String* pattern = regexp->Pattern();
574 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
575 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
576 }
577 #endif
578 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
579 if (required_registers < 0) {
580 // Compiling failed with an exception.
581 DCHECK(isolate->has_pending_exception());
582 return MaybeHandle<Object>();
583 }
584
585 int32_t* output_registers = NULL;
586 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
587 output_registers = NewArray<int32_t>(required_registers);
588 }
589 std::unique_ptr<int32_t[]> auto_release(output_registers);
590 if (output_registers == NULL) {
591 output_registers = isolate->jsregexp_static_offsets_vector();
592 }
593
594 int res = RegExpImpl::IrregexpExecRaw(
595 regexp, subject, previous_index, output_registers, required_registers);
596 if (res == RE_SUCCESS) {
597 int capture_count =
598 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
599 return SetLastMatchInfo(
600 last_match_info, subject, capture_count, output_registers);
601 }
602 if (res == RE_EXCEPTION) {
603 DCHECK(isolate->has_pending_exception());
604 return MaybeHandle<Object>();
605 }
606 DCHECK(res == RE_FAILURE);
607 return isolate->factory()->null_value();
608 }
609
SetLastMatchInfo(Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)610 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
611 Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
612 int capture_count, int32_t* match) {
613 // This is the only place where match infos can grow. If, after executing the
614 // regexp, RegExpExecStub finds that the match info is too small, it restarts
615 // execution in RegExpImpl::Exec, which finally grows the match info right
616 // here.
617
618 int capture_register_count = (capture_count + 1) * 2;
619 Handle<RegExpMatchInfo> result =
620 RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
621 result->SetNumberOfCaptureRegisters(capture_register_count);
622
623 if (*result != *last_match_info) {
624 // The match info has been reallocated, update the corresponding reference
625 // on the native context.
626 Isolate* isolate = last_match_info->GetIsolate();
627 if (*last_match_info == *isolate->regexp_last_match_info()) {
628 isolate->native_context()->set_regexp_last_match_info(*result);
629 } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
630 isolate->native_context()->set_regexp_internal_match_info(*result);
631 }
632 }
633
634 DisallowHeapAllocation no_allocation;
635 if (match != NULL) {
636 for (int i = 0; i < capture_register_count; i += 2) {
637 result->SetCapture(i, match[i]);
638 result->SetCapture(i + 1, match[i + 1]);
639 }
640 }
641 result->SetLastSubject(*subject);
642 result->SetLastInput(*subject);
643 return result;
644 }
645
646
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)647 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
648 Handle<String> subject,
649 Isolate* isolate)
650 : register_array_(NULL),
651 register_array_size_(0),
652 regexp_(regexp),
653 subject_(subject) {
654 #ifdef V8_INTERPRETED_REGEXP
655 bool interpreted = true;
656 #else
657 bool interpreted = false;
658 #endif // V8_INTERPRETED_REGEXP
659
660 if (regexp_->TypeTag() == JSRegExp::ATOM) {
661 static const int kAtomRegistersPerMatch = 2;
662 registers_per_match_ = kAtomRegistersPerMatch;
663 // There is no distinction between interpreted and native for atom regexps.
664 interpreted = false;
665 } else {
666 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
667 if (registers_per_match_ < 0) {
668 num_matches_ = -1; // Signal exception.
669 return;
670 }
671 }
672
673 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
674 if (!interpreted) {
675 register_array_size_ =
676 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
677 max_matches_ = register_array_size_ / registers_per_match_;
678 } else {
679 // Global loop in interpreted regexp is not implemented. We choose
680 // the size of the offsets vector so that it can only store one match.
681 register_array_size_ = registers_per_match_;
682 max_matches_ = 1;
683 }
684
685 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
686 register_array_ = NewArray<int32_t>(register_array_size_);
687 } else {
688 register_array_ = isolate->jsregexp_static_offsets_vector();
689 }
690
691 // Set state so that fetching the results the first time triggers a call
692 // to the compiled regexp.
693 current_match_index_ = max_matches_ - 1;
694 num_matches_ = max_matches_;
695 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
696 DCHECK_GE(register_array_size_, registers_per_match_);
697 int32_t* last_match =
698 ®ister_array_[current_match_index_ * registers_per_match_];
699 last_match[0] = -1;
700 last_match[1] = 0;
701 }
702
AdvanceZeroLength(int last_index)703 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
704 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
705 last_index + 1 < subject_->length() &&
706 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
707 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
708 // Advance over the surrogate pair.
709 return last_index + 2;
710 }
711 return last_index + 1;
712 }
713
714 // -------------------------------------------------------------------
715 // Implementation of the Irregexp regular expression engine.
716 //
717 // The Irregexp regular expression engine is intended to be a complete
718 // implementation of ECMAScript regular expressions. It generates either
719 // bytecodes or native code.
720
721 // The Irregexp regexp engine is structured in three steps.
722 // 1) The parser generates an abstract syntax tree. See ast.cc.
723 // 2) From the AST a node network is created. The nodes are all
724 // subclasses of RegExpNode. The nodes represent states when
725 // executing a regular expression. Several optimizations are
726 // performed on the node network.
727 // 3) From the nodes we generate either byte codes or native code
728 // that can actually execute the regular expression (perform
729 // the search). The code generation step is described in more
730 // detail below.
731
732 // Code generation.
733 //
734 // The nodes are divided into four main categories.
735 // * Choice nodes
736 // These represent places where the regular expression can
737 // match in more than one way. For example on entry to an
738 // alternation (foo|bar) or a repetition (*, +, ? or {}).
739 // * Action nodes
740 // These represent places where some action should be
741 // performed. Examples include recording the current position
742 // in the input string to a register (in order to implement
743 // captures) or other actions on register for example in order
744 // to implement the counters needed for {} repetitions.
745 // * Matching nodes
746 // These attempt to match some element part of the input string.
747 // Examples of elements include character classes, plain strings
748 // or back references.
749 // * End nodes
750 // These are used to implement the actions required on finding
751 // a successful match or failing to find a match.
752 //
753 // The code generated (whether as byte codes or native code) maintains
754 // some state as it runs. This consists of the following elements:
755 //
756 // * The capture registers. Used for string captures.
757 // * Other registers. Used for counters etc.
758 // * The current position.
759 // * The stack of backtracking information. Used when a matching node
760 // fails to find a match and needs to try an alternative.
761 //
762 // Conceptual regular expression execution model:
763 //
764 // There is a simple conceptual model of regular expression execution
765 // which will be presented first. The actual code generated is a more
766 // efficient simulation of the simple conceptual model:
767 //
768 // * Choice nodes are implemented as follows:
769 // For each choice except the last {
770 // push current position
771 // push backtrack code location
772 // <generate code to test for choice>
773 // backtrack code location:
774 // pop current position
775 // }
776 // <generate code to test for last choice>
777 //
778 // * Actions nodes are generated as follows
779 // <push affected registers on backtrack stack>
780 // <generate code to perform action>
781 // push backtrack code location
782 // <generate code to test for following nodes>
783 // backtrack code location:
784 // <pop affected registers to restore their state>
785 // <pop backtrack location from stack and go to it>
786 //
787 // * Matching nodes are generated as follows:
788 // if input string matches at current position
789 // update current position
790 // <generate code to test for following nodes>
791 // else
792 // <pop backtrack location from stack and go to it>
793 //
794 // Thus it can be seen that the current position is saved and restored
795 // by the choice nodes, whereas the registers are saved and restored by
796 // by the action nodes that manipulate them.
797 //
798 // The other interesting aspect of this model is that nodes are generated
799 // at the point where they are needed by a recursive call to Emit(). If
800 // the node has already been code generated then the Emit() call will
801 // generate a jump to the previously generated code instead. In order to
802 // limit recursion it is possible for the Emit() function to put the node
803 // on a work list for later generation and instead generate a jump. The
804 // destination of the jump is resolved later when the code is generated.
805 //
806 // Actual regular expression code generation.
807 //
808 // Code generation is actually more complicated than the above. In order
809 // to improve the efficiency of the generated code some optimizations are
810 // performed
811 //
812 // * Choice nodes have 1-character lookahead.
813 // A choice node looks at the following character and eliminates some of
814 // the choices immediately based on that character. This is not yet
815 // implemented.
816 // * Simple greedy loops store reduced backtracking information.
817 // A quantifier like /.*foo/m will greedily match the whole input. It will
818 // then need to backtrack to a point where it can match "foo". The naive
819 // implementation of this would push each character position onto the
820 // backtracking stack, then pop them off one by one. This would use space
821 // proportional to the length of the input string. However since the "."
822 // can only match in one way and always has a constant length (in this case
823 // of 1) it suffices to store the current position on the top of the stack
824 // once. Matching now becomes merely incrementing the current position and
825 // backtracking becomes decrementing the current position and checking the
826 // result against the stored current position. This is faster and saves
827 // space.
828 // * The current state is virtualized.
829 // This is used to defer expensive operations until it is clear that they
830 // are needed and to generate code for a node more than once, allowing
831 // specialized an efficient versions of the code to be created. This is
832 // explained in the section below.
833 //
834 // Execution state virtualization.
835 //
836 // Instead of emitting code, nodes that manipulate the state can record their
837 // manipulation in an object called the Trace. The Trace object can record a
838 // current position offset, an optional backtrack code location on the top of
839 // the virtualized backtrack stack and some register changes. When a node is
840 // to be emitted it can flush the Trace or update it. Flushing the Trace
841 // will emit code to bring the actual state into line with the virtual state.
842 // Avoiding flushing the state can postpone some work (e.g. updates of capture
843 // registers). Postponing work can save time when executing the regular
844 // expression since it may be found that the work never has to be done as a
845 // failure to match can occur. In addition it is much faster to jump to a
846 // known backtrack code location than it is to pop an unknown backtrack
847 // location from the stack and jump there.
848 //
849 // The virtual state found in the Trace affects code generation. For example
850 // the virtual state contains the difference between the actual current
851 // position and the virtual current position, and matching code needs to use
852 // this offset to attempt a match in the correct location of the input
853 // string. Therefore code generated for a non-trivial trace is specialized
854 // to that trace. The code generator therefore has the ability to generate
855 // code for each node several times. In order to limit the size of the
856 // generated code there is an arbitrary limit on how many specialized sets of
857 // code may be generated for a given node. If the limit is reached, the
858 // trace is flushed and a generic version of the code for a node is emitted.
859 // This is subsequently used for that node. The code emitted for non-generic
860 // trace is not recorded in the node and so it cannot currently be reused in
861 // the event that code generation is requested for an identical trace.
862
863
AppendToText(RegExpText * text,Zone * zone)864 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
865 UNREACHABLE();
866 }
867
868
AppendToText(RegExpText * text,Zone * zone)869 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
870 text->AddElement(TextElement::Atom(this), zone);
871 }
872
873
AppendToText(RegExpText * text,Zone * zone)874 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
875 text->AddElement(TextElement::CharClass(this), zone);
876 }
877
878
AppendToText(RegExpText * text,Zone * zone)879 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
880 for (int i = 0; i < elements()->length(); i++)
881 text->AddElement(elements()->at(i), zone);
882 }
883
884
Atom(RegExpAtom * atom)885 TextElement TextElement::Atom(RegExpAtom* atom) {
886 return TextElement(ATOM, atom);
887 }
888
889
CharClass(RegExpCharacterClass * char_class)890 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
891 return TextElement(CHAR_CLASS, char_class);
892 }
893
894
length() const895 int TextElement::length() const {
896 switch (text_type()) {
897 case ATOM:
898 return atom()->length();
899
900 case CHAR_CLASS:
901 return 1;
902 }
903 UNREACHABLE();
904 return 0;
905 }
906
907
GetTable(bool ignore_case)908 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
909 if (table_ == NULL) {
910 table_ = new(zone()) DispatchTable(zone());
911 DispatchTableConstructor cons(table_, ignore_case, zone());
912 cons.BuildTable(this);
913 }
914 return table_;
915 }
916
917
918 class FrequencyCollator {
919 public:
FrequencyCollator()920 FrequencyCollator() : total_samples_(0) {
921 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
922 frequencies_[i] = CharacterFrequency(i);
923 }
924 }
925
CountCharacter(int character)926 void CountCharacter(int character) {
927 int index = (character & RegExpMacroAssembler::kTableMask);
928 frequencies_[index].Increment();
929 total_samples_++;
930 }
931
932 // Does not measure in percent, but rather per-128 (the table size from the
933 // regexp macro assembler).
Frequency(int in_character)934 int Frequency(int in_character) {
935 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
936 if (total_samples_ < 1) return 1; // Division by zero.
937 int freq_in_per128 =
938 (frequencies_[in_character].counter() * 128) / total_samples_;
939 return freq_in_per128;
940 }
941
942 private:
943 class CharacterFrequency {
944 public:
CharacterFrequency()945 CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)946 explicit CharacterFrequency(int character)
947 : counter_(0), character_(character) { }
948
Increment()949 void Increment() { counter_++; }
counter()950 int counter() { return counter_; }
character()951 int character() { return character_; }
952
953 private:
954 int counter_;
955 int character_;
956 };
957
958
959 private:
960 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
961 int total_samples_;
962 };
963
964
965 class RegExpCompiler {
966 public:
967 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
968 JSRegExp::Flags flags, bool is_one_byte);
969
AllocateRegister()970 int AllocateRegister() {
971 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
972 reg_exp_too_big_ = true;
973 return next_register_;
974 }
975 return next_register_++;
976 }
977
978 // Lookarounds to match lone surrogates for unicode character class matches
979 // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()980 int UnicodeLookaroundStackRegister() {
981 if (unicode_lookaround_stack_register_ == kNoRegister) {
982 unicode_lookaround_stack_register_ = AllocateRegister();
983 }
984 return unicode_lookaround_stack_register_;
985 }
986
UnicodeLookaroundPositionRegister()987 int UnicodeLookaroundPositionRegister() {
988 if (unicode_lookaround_position_register_ == kNoRegister) {
989 unicode_lookaround_position_register_ = AllocateRegister();
990 }
991 return unicode_lookaround_position_register_;
992 }
993
994 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
995 RegExpNode* start,
996 int capture_count,
997 Handle<String> pattern);
998
AddWork(RegExpNode * node)999 inline void AddWork(RegExpNode* node) {
1000 if (!node->on_work_list() && !node->label()->is_bound()) {
1001 node->set_on_work_list(true);
1002 work_list_->Add(node);
1003 }
1004 }
1005
1006 static const int kImplementationOffset = 0;
1007 static const int kNumberOfRegistersOffset = 0;
1008 static const int kCodeOffset = 1;
1009
macro_assembler()1010 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1011 EndNode* accept() { return accept_; }
1012
1013 static const int kMaxRecursion = 100;
recursion_depth()1014 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1015 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1016 inline void DecrementRecursionDepth() { recursion_depth_--; }
1017
SetRegExpTooBig()1018 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1019
ignore_case()1020 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1021 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
one_byte()1022 inline bool one_byte() { return one_byte_; }
optimize()1023 inline bool optimize() { return optimize_; }
set_optimize(bool value)1024 inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()1025 inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)1026 inline void set_limiting_recursion(bool value) {
1027 limiting_recursion_ = value;
1028 }
read_backward()1029 bool read_backward() { return read_backward_; }
set_read_backward(bool value)1030 void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()1031 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1032
current_expansion_factor()1033 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1034 void set_current_expansion_factor(int value) {
1035 current_expansion_factor_ = value;
1036 }
1037
isolate() const1038 Isolate* isolate() const { return isolate_; }
zone() const1039 Zone* zone() const { return zone_; }
1040
1041 static const int kNoRegister = -1;
1042
1043 private:
1044 EndNode* accept_;
1045 int next_register_;
1046 int unicode_lookaround_stack_register_;
1047 int unicode_lookaround_position_register_;
1048 List<RegExpNode*>* work_list_;
1049 int recursion_depth_;
1050 RegExpMacroAssembler* macro_assembler_;
1051 JSRegExp::Flags flags_;
1052 bool one_byte_;
1053 bool reg_exp_too_big_;
1054 bool limiting_recursion_;
1055 bool optimize_;
1056 bool read_backward_;
1057 int current_expansion_factor_;
1058 FrequencyCollator frequency_collator_;
1059 Isolate* isolate_;
1060 Zone* zone_;
1061 };
1062
1063
1064 class RecursionCheck {
1065 public:
RecursionCheck(RegExpCompiler * compiler)1066 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1067 compiler->IncrementRecursionDepth();
1068 }
~RecursionCheck()1069 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1070 private:
1071 RegExpCompiler* compiler_;
1072 };
1073
1074
IrregexpRegExpTooBig(Isolate * isolate)1075 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1076 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1077 }
1078
1079
1080 // Attempts to compile the regexp using an Irregexp code generator. Returns
1081 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,JSRegExp::Flags flags,bool one_byte)1082 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1083 JSRegExp::Flags flags, bool one_byte)
1084 : next_register_(2 * (capture_count + 1)),
1085 unicode_lookaround_stack_register_(kNoRegister),
1086 unicode_lookaround_position_register_(kNoRegister),
1087 work_list_(NULL),
1088 recursion_depth_(0),
1089 flags_(flags),
1090 one_byte_(one_byte),
1091 reg_exp_too_big_(false),
1092 limiting_recursion_(false),
1093 optimize_(FLAG_regexp_optimization),
1094 read_backward_(false),
1095 current_expansion_factor_(1),
1096 frequency_collator_(),
1097 isolate_(isolate),
1098 zone_(zone) {
1099 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1100 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1101 }
1102
1103
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1104 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1105 RegExpMacroAssembler* macro_assembler,
1106 RegExpNode* start,
1107 int capture_count,
1108 Handle<String> pattern) {
1109 Isolate* isolate = pattern->GetHeap()->isolate();
1110
1111 #ifdef DEBUG
1112 if (FLAG_trace_regexp_assembler)
1113 macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1114 else
1115 #endif
1116 macro_assembler_ = macro_assembler;
1117
1118 List <RegExpNode*> work_list(0);
1119 work_list_ = &work_list;
1120 Label fail;
1121 macro_assembler_->PushBacktrack(&fail);
1122 Trace new_trace;
1123 start->Emit(this, &new_trace);
1124 macro_assembler_->Bind(&fail);
1125 macro_assembler_->Fail();
1126 while (!work_list.is_empty()) {
1127 RegExpNode* node = work_list.RemoveLast();
1128 node->set_on_work_list(false);
1129 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1130 }
1131 if (reg_exp_too_big_) {
1132 macro_assembler_->AbortedCodeGeneration();
1133 return IrregexpRegExpTooBig(isolate_);
1134 }
1135
1136 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1137 isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1138 work_list_ = NULL;
1139 #ifdef ENABLE_DISASSEMBLER
1140 if (FLAG_print_code) {
1141 CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1142 OFStream os(trace_scope.file());
1143 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1144 }
1145 #endif
1146 #ifdef DEBUG
1147 if (FLAG_trace_regexp_assembler) {
1148 delete macro_assembler_;
1149 }
1150 #endif
1151 return RegExpEngine::CompilationResult(*code, next_register_);
1152 }
1153
1154
Mentions(int that)1155 bool Trace::DeferredAction::Mentions(int that) {
1156 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1157 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1158 return range.Contains(that);
1159 } else {
1160 return reg() == that;
1161 }
1162 }
1163
1164
mentions_reg(int reg)1165 bool Trace::mentions_reg(int reg) {
1166 for (DeferredAction* action = actions_;
1167 action != NULL;
1168 action = action->next()) {
1169 if (action->Mentions(reg))
1170 return true;
1171 }
1172 return false;
1173 }
1174
1175
GetStoredPosition(int reg,int * cp_offset)1176 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1177 DCHECK_EQ(0, *cp_offset);
1178 for (DeferredAction* action = actions_;
1179 action != NULL;
1180 action = action->next()) {
1181 if (action->Mentions(reg)) {
1182 if (action->action_type() == ActionNode::STORE_POSITION) {
1183 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1184 return true;
1185 } else {
1186 return false;
1187 }
1188 }
1189 }
1190 return false;
1191 }
1192
1193
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1194 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1195 Zone* zone) {
1196 int max_register = RegExpCompiler::kNoRegister;
1197 for (DeferredAction* action = actions_;
1198 action != NULL;
1199 action = action->next()) {
1200 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1201 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1202 for (int i = range.from(); i <= range.to(); i++)
1203 affected_registers->Set(i, zone);
1204 if (range.to() > max_register) max_register = range.to();
1205 } else {
1206 affected_registers->Set(action->reg(), zone);
1207 if (action->reg() > max_register) max_register = action->reg();
1208 }
1209 }
1210 return max_register;
1211 }
1212
1213
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1214 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1215 int max_register,
1216 const OutSet& registers_to_pop,
1217 const OutSet& registers_to_clear) {
1218 for (int reg = max_register; reg >= 0; reg--) {
1219 if (registers_to_pop.Get(reg)) {
1220 assembler->PopRegister(reg);
1221 } else if (registers_to_clear.Get(reg)) {
1222 int clear_to = reg;
1223 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1224 reg--;
1225 }
1226 assembler->ClearRegisters(reg, clear_to);
1227 }
1228 }
1229 }
1230
1231
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1232 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1233 int max_register,
1234 const OutSet& affected_registers,
1235 OutSet* registers_to_pop,
1236 OutSet* registers_to_clear,
1237 Zone* zone) {
1238 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1239 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1240
1241 // Count pushes performed to force a stack limit check occasionally.
1242 int pushes = 0;
1243
1244 for (int reg = 0; reg <= max_register; reg++) {
1245 if (!affected_registers.Get(reg)) {
1246 continue;
1247 }
1248
1249 // The chronologically first deferred action in the trace
1250 // is used to infer the action needed to restore a register
1251 // to its previous state (or not, if it's safe to ignore it).
1252 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1253 DeferredActionUndoType undo_action = IGNORE;
1254
1255 int value = 0;
1256 bool absolute = false;
1257 bool clear = false;
1258 static const int kNoStore = kMinInt;
1259 int store_position = kNoStore;
1260 // This is a little tricky because we are scanning the actions in reverse
1261 // historical order (newest first).
1262 for (DeferredAction* action = actions_;
1263 action != NULL;
1264 action = action->next()) {
1265 if (action->Mentions(reg)) {
1266 switch (action->action_type()) {
1267 case ActionNode::SET_REGISTER: {
1268 Trace::DeferredSetRegister* psr =
1269 static_cast<Trace::DeferredSetRegister*>(action);
1270 if (!absolute) {
1271 value += psr->value();
1272 absolute = true;
1273 }
1274 // SET_REGISTER is currently only used for newly introduced loop
1275 // counters. They can have a significant previous value if they
1276 // occour in a loop. TODO(lrn): Propagate this information, so
1277 // we can set undo_action to IGNORE if we know there is no value to
1278 // restore.
1279 undo_action = RESTORE;
1280 DCHECK_EQ(store_position, kNoStore);
1281 DCHECK(!clear);
1282 break;
1283 }
1284 case ActionNode::INCREMENT_REGISTER:
1285 if (!absolute) {
1286 value++;
1287 }
1288 DCHECK_EQ(store_position, kNoStore);
1289 DCHECK(!clear);
1290 undo_action = RESTORE;
1291 break;
1292 case ActionNode::STORE_POSITION: {
1293 Trace::DeferredCapture* pc =
1294 static_cast<Trace::DeferredCapture*>(action);
1295 if (!clear && store_position == kNoStore) {
1296 store_position = pc->cp_offset();
1297 }
1298
1299 // For captures we know that stores and clears alternate.
1300 // Other register, are never cleared, and if the occur
1301 // inside a loop, they might be assigned more than once.
1302 if (reg <= 1) {
1303 // Registers zero and one, aka "capture zero", is
1304 // always set correctly if we succeed. There is no
1305 // need to undo a setting on backtrack, because we
1306 // will set it again or fail.
1307 undo_action = IGNORE;
1308 } else {
1309 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1310 }
1311 DCHECK(!absolute);
1312 DCHECK_EQ(value, 0);
1313 break;
1314 }
1315 case ActionNode::CLEAR_CAPTURES: {
1316 // Since we're scanning in reverse order, if we've already
1317 // set the position we have to ignore historically earlier
1318 // clearing operations.
1319 if (store_position == kNoStore) {
1320 clear = true;
1321 }
1322 undo_action = RESTORE;
1323 DCHECK(!absolute);
1324 DCHECK_EQ(value, 0);
1325 break;
1326 }
1327 default:
1328 UNREACHABLE();
1329 break;
1330 }
1331 }
1332 }
1333 // Prepare for the undo-action (e.g., push if it's going to be popped).
1334 if (undo_action == RESTORE) {
1335 pushes++;
1336 RegExpMacroAssembler::StackCheckFlag stack_check =
1337 RegExpMacroAssembler::kNoStackLimitCheck;
1338 if (pushes == push_limit) {
1339 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1340 pushes = 0;
1341 }
1342
1343 assembler->PushRegister(reg, stack_check);
1344 registers_to_pop->Set(reg, zone);
1345 } else if (undo_action == CLEAR) {
1346 registers_to_clear->Set(reg, zone);
1347 }
1348 // Perform the chronologically last action (or accumulated increment)
1349 // for the register.
1350 if (store_position != kNoStore) {
1351 assembler->WriteCurrentPositionToRegister(reg, store_position);
1352 } else if (clear) {
1353 assembler->ClearRegisters(reg, reg);
1354 } else if (absolute) {
1355 assembler->SetRegister(reg, value);
1356 } else if (value != 0) {
1357 assembler->AdvanceRegister(reg, value);
1358 }
1359 }
1360 }
1361
1362
1363 // This is called as we come into a loop choice node and some other tricky
1364 // nodes. It normalizes the state of the code generator to ensure we can
1365 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1366 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1367 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1368
1369 DCHECK(!is_trivial());
1370
1371 if (actions_ == NULL && backtrack() == NULL) {
1372 // Here we just have some deferred cp advances to fix and we are back to
1373 // a normal situation. We may also have to forget some information gained
1374 // through a quick check that was already performed.
1375 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1376 // Create a new trivial state and generate the node with that.
1377 Trace new_state;
1378 successor->Emit(compiler, &new_state);
1379 return;
1380 }
1381
1382 // Generate deferred actions here along with code to undo them again.
1383 OutSet affected_registers;
1384
1385 if (backtrack() != NULL) {
1386 // Here we have a concrete backtrack location. These are set up by choice
1387 // nodes and so they indicate that we have a deferred save of the current
1388 // position which we may need to emit here.
1389 assembler->PushCurrentPosition();
1390 }
1391
1392 int max_register = FindAffectedRegisters(&affected_registers,
1393 compiler->zone());
1394 OutSet registers_to_pop;
1395 OutSet registers_to_clear;
1396 PerformDeferredActions(assembler,
1397 max_register,
1398 affected_registers,
1399 ®isters_to_pop,
1400 ®isters_to_clear,
1401 compiler->zone());
1402 if (cp_offset_ != 0) {
1403 assembler->AdvanceCurrentPosition(cp_offset_);
1404 }
1405
1406 // Create a new trivial state and generate the node with that.
1407 Label undo;
1408 assembler->PushBacktrack(&undo);
1409 if (successor->KeepRecursing(compiler)) {
1410 Trace new_state;
1411 successor->Emit(compiler, &new_state);
1412 } else {
1413 compiler->AddWork(successor);
1414 assembler->GoTo(successor->label());
1415 }
1416
1417 // On backtrack we need to restore state.
1418 assembler->Bind(&undo);
1419 RestoreAffectedRegisters(assembler,
1420 max_register,
1421 registers_to_pop,
1422 registers_to_clear);
1423 if (backtrack() == NULL) {
1424 assembler->Backtrack();
1425 } else {
1426 assembler->PopCurrentPosition();
1427 assembler->GoTo(backtrack());
1428 }
1429 }
1430
1431
Emit(RegExpCompiler * compiler,Trace * trace)1432 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1433 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1434
1435 // Omit flushing the trace. We discard the entire stack frame anyway.
1436
1437 if (!label()->is_bound()) {
1438 // We are completely independent of the trace, since we ignore it,
1439 // so this code can be used as the generic version.
1440 assembler->Bind(label());
1441 }
1442
1443 // Throw away everything on the backtrack stack since the start
1444 // of the negative submatch and restore the character position.
1445 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1446 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1447 if (clear_capture_count_ > 0) {
1448 // Clear any captures that might have been performed during the success
1449 // of the body of the negative look-ahead.
1450 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1451 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1452 }
1453 // Now that we have unwound the stack we find at the top of the stack the
1454 // backtrack that the BeginSubmatch node got.
1455 assembler->Backtrack();
1456 }
1457
1458
Emit(RegExpCompiler * compiler,Trace * trace)1459 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1460 if (!trace->is_trivial()) {
1461 trace->Flush(compiler, this);
1462 return;
1463 }
1464 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1465 if (!label()->is_bound()) {
1466 assembler->Bind(label());
1467 }
1468 switch (action_) {
1469 case ACCEPT:
1470 assembler->Succeed();
1471 return;
1472 case BACKTRACK:
1473 assembler->GoTo(trace->backtrack());
1474 return;
1475 case NEGATIVE_SUBMATCH_SUCCESS:
1476 // This case is handled in a different virtual method.
1477 UNREACHABLE();
1478 }
1479 UNIMPLEMENTED();
1480 }
1481
1482
AddGuard(Guard * guard,Zone * zone)1483 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1484 if (guards_ == NULL)
1485 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1486 guards_->Add(guard, zone);
1487 }
1488
1489
SetRegister(int reg,int val,RegExpNode * on_success)1490 ActionNode* ActionNode::SetRegister(int reg,
1491 int val,
1492 RegExpNode* on_success) {
1493 ActionNode* result =
1494 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1495 result->data_.u_store_register.reg = reg;
1496 result->data_.u_store_register.value = val;
1497 return result;
1498 }
1499
1500
IncrementRegister(int reg,RegExpNode * on_success)1501 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1502 ActionNode* result =
1503 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1504 result->data_.u_increment_register.reg = reg;
1505 return result;
1506 }
1507
1508
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1509 ActionNode* ActionNode::StorePosition(int reg,
1510 bool is_capture,
1511 RegExpNode* on_success) {
1512 ActionNode* result =
1513 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1514 result->data_.u_position_register.reg = reg;
1515 result->data_.u_position_register.is_capture = is_capture;
1516 return result;
1517 }
1518
1519
ClearCaptures(Interval range,RegExpNode * on_success)1520 ActionNode* ActionNode::ClearCaptures(Interval range,
1521 RegExpNode* on_success) {
1522 ActionNode* result =
1523 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1524 result->data_.u_clear_captures.range_from = range.from();
1525 result->data_.u_clear_captures.range_to = range.to();
1526 return result;
1527 }
1528
1529
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1530 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1531 int position_reg,
1532 RegExpNode* on_success) {
1533 ActionNode* result =
1534 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1535 result->data_.u_submatch.stack_pointer_register = stack_reg;
1536 result->data_.u_submatch.current_position_register = position_reg;
1537 return result;
1538 }
1539
1540
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1541 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1542 int position_reg,
1543 int clear_register_count,
1544 int clear_register_from,
1545 RegExpNode* on_success) {
1546 ActionNode* result =
1547 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1548 result->data_.u_submatch.stack_pointer_register = stack_reg;
1549 result->data_.u_submatch.current_position_register = position_reg;
1550 result->data_.u_submatch.clear_register_count = clear_register_count;
1551 result->data_.u_submatch.clear_register_from = clear_register_from;
1552 return result;
1553 }
1554
1555
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1556 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1557 int repetition_register,
1558 int repetition_limit,
1559 RegExpNode* on_success) {
1560 ActionNode* result =
1561 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1562 result->data_.u_empty_match_check.start_register = start_register;
1563 result->data_.u_empty_match_check.repetition_register = repetition_register;
1564 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1565 return result;
1566 }
1567
1568
1569 #define DEFINE_ACCEPT(Type) \
1570 void Type##Node::Accept(NodeVisitor* visitor) { \
1571 visitor->Visit##Type(this); \
1572 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1573 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1574 #undef DEFINE_ACCEPT
1575
1576
1577 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1578 visitor->VisitLoopChoice(this);
1579 }
1580
1581
1582 // -------------------------------------------------------------------
1583 // Emit code.
1584
1585
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1586 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1587 Guard* guard,
1588 Trace* trace) {
1589 switch (guard->op()) {
1590 case Guard::LT:
1591 DCHECK(!trace->mentions_reg(guard->reg()));
1592 macro_assembler->IfRegisterGE(guard->reg(),
1593 guard->value(),
1594 trace->backtrack());
1595 break;
1596 case Guard::GEQ:
1597 DCHECK(!trace->mentions_reg(guard->reg()));
1598 macro_assembler->IfRegisterLT(guard->reg(),
1599 guard->value(),
1600 trace->backtrack());
1601 break;
1602 }
1603 }
1604
1605
1606 // Returns the number of characters in the equivalence class, omitting those
1607 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1608 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1609 bool one_byte_subject,
1610 unibrow::uchar* letters) {
1611 int length =
1612 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1613 // Unibrow returns 0 or 1 for characters where case independence is
1614 // trivial.
1615 if (length == 0) {
1616 letters[0] = character;
1617 length = 1;
1618 }
1619
1620 if (one_byte_subject) {
1621 int new_length = 0;
1622 for (int i = 0; i < length; i++) {
1623 if (letters[i] <= String::kMaxOneByteCharCode) {
1624 letters[new_length++] = letters[i];
1625 }
1626 }
1627 length = new_length;
1628 }
1629
1630 return length;
1631 }
1632
1633
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1634 static inline bool EmitSimpleCharacter(Isolate* isolate,
1635 RegExpCompiler* compiler,
1636 uc16 c,
1637 Label* on_failure,
1638 int cp_offset,
1639 bool check,
1640 bool preloaded) {
1641 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1642 bool bound_checked = false;
1643 if (!preloaded) {
1644 assembler->LoadCurrentCharacter(
1645 cp_offset,
1646 on_failure,
1647 check);
1648 bound_checked = true;
1649 }
1650 assembler->CheckNotCharacter(c, on_failure);
1651 return bound_checked;
1652 }
1653
1654
1655 // Only emits non-letters (things that don't have case). Only used for case
1656 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1657 static inline bool EmitAtomNonLetter(Isolate* isolate,
1658 RegExpCompiler* compiler,
1659 uc16 c,
1660 Label* on_failure,
1661 int cp_offset,
1662 bool check,
1663 bool preloaded) {
1664 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1665 bool one_byte = compiler->one_byte();
1666 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1667 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1668 if (length < 1) {
1669 // This can't match. Must be an one-byte subject and a non-one-byte
1670 // character. We do not need to do anything since the one-byte pass
1671 // already handled this.
1672 return false; // Bounds not checked.
1673 }
1674 bool checked = false;
1675 // We handle the length > 1 case in a later pass.
1676 if (length == 1) {
1677 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1678 // Can't match - see above.
1679 return false; // Bounds not checked.
1680 }
1681 if (!preloaded) {
1682 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1683 checked = check;
1684 }
1685 macro_assembler->CheckNotCharacter(c, on_failure);
1686 }
1687 return checked;
1688 }
1689
1690
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1691 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1692 bool one_byte, uc16 c1, uc16 c2,
1693 Label* on_failure) {
1694 uc16 char_mask;
1695 if (one_byte) {
1696 char_mask = String::kMaxOneByteCharCode;
1697 } else {
1698 char_mask = String::kMaxUtf16CodeUnit;
1699 }
1700 uc16 exor = c1 ^ c2;
1701 // Check whether exor has only one bit set.
1702 if (((exor - 1) & exor) == 0) {
1703 // If c1 and c2 differ only by one bit.
1704 // Ecma262UnCanonicalize always gives the highest number last.
1705 DCHECK(c2 > c1);
1706 uc16 mask = char_mask ^ exor;
1707 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1708 return true;
1709 }
1710 DCHECK(c2 > c1);
1711 uc16 diff = c2 - c1;
1712 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1713 // If the characters differ by 2^n but don't differ by one bit then
1714 // subtract the difference from the found character, then do the or
1715 // trick. We avoid the theoretical case where negative numbers are
1716 // involved in order to simplify code generation.
1717 uc16 mask = char_mask ^ diff;
1718 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1719 diff,
1720 mask,
1721 on_failure);
1722 return true;
1723 }
1724 return false;
1725 }
1726
1727
1728 typedef bool EmitCharacterFunction(Isolate* isolate,
1729 RegExpCompiler* compiler,
1730 uc16 c,
1731 Label* on_failure,
1732 int cp_offset,
1733 bool check,
1734 bool preloaded);
1735
1736 // Only emits letters (things that have case). Only used for case independent
1737 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1738 static inline bool EmitAtomLetter(Isolate* isolate,
1739 RegExpCompiler* compiler,
1740 uc16 c,
1741 Label* on_failure,
1742 int cp_offset,
1743 bool check,
1744 bool preloaded) {
1745 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1746 bool one_byte = compiler->one_byte();
1747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1748 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1749 if (length <= 1) return false;
1750 // We may not need to check against the end of the input string
1751 // if this character lies before a character that matched.
1752 if (!preloaded) {
1753 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1754 }
1755 Label ok;
1756 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1757 switch (length) {
1758 case 2: {
1759 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1760 chars[1], on_failure)) {
1761 } else {
1762 macro_assembler->CheckCharacter(chars[0], &ok);
1763 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1764 macro_assembler->Bind(&ok);
1765 }
1766 break;
1767 }
1768 case 4:
1769 macro_assembler->CheckCharacter(chars[3], &ok);
1770 // Fall through!
1771 case 3:
1772 macro_assembler->CheckCharacter(chars[0], &ok);
1773 macro_assembler->CheckCharacter(chars[1], &ok);
1774 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1775 macro_assembler->Bind(&ok);
1776 break;
1777 default:
1778 UNREACHABLE();
1779 break;
1780 }
1781 return true;
1782 }
1783
1784
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1785 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1786 int border,
1787 Label* fall_through,
1788 Label* above_or_equal,
1789 Label* below) {
1790 if (below != fall_through) {
1791 masm->CheckCharacterLT(border, below);
1792 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1793 } else {
1794 masm->CheckCharacterGT(border - 1, above_or_equal);
1795 }
1796 }
1797
1798
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1799 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1800 int first,
1801 int last,
1802 Label* fall_through,
1803 Label* in_range,
1804 Label* out_of_range) {
1805 if (in_range == fall_through) {
1806 if (first == last) {
1807 masm->CheckNotCharacter(first, out_of_range);
1808 } else {
1809 masm->CheckCharacterNotInRange(first, last, out_of_range);
1810 }
1811 } else {
1812 if (first == last) {
1813 masm->CheckCharacter(first, in_range);
1814 } else {
1815 masm->CheckCharacterInRange(first, last, in_range);
1816 }
1817 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1818 }
1819 }
1820
1821
1822 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1823 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1824 static void EmitUseLookupTable(
1825 RegExpMacroAssembler* masm,
1826 ZoneList<int>* ranges,
1827 int start_index,
1828 int end_index,
1829 int min_char,
1830 Label* fall_through,
1831 Label* even_label,
1832 Label* odd_label) {
1833 static const int kSize = RegExpMacroAssembler::kTableSize;
1834 static const int kMask = RegExpMacroAssembler::kTableMask;
1835
1836 int base = (min_char & ~kMask);
1837 USE(base);
1838
1839 // Assert that everything is on one kTableSize page.
1840 for (int i = start_index; i <= end_index; i++) {
1841 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1842 }
1843 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1844
1845 char templ[kSize];
1846 Label* on_bit_set;
1847 Label* on_bit_clear;
1848 int bit;
1849 if (even_label == fall_through) {
1850 on_bit_set = odd_label;
1851 on_bit_clear = even_label;
1852 bit = 1;
1853 } else {
1854 on_bit_set = even_label;
1855 on_bit_clear = odd_label;
1856 bit = 0;
1857 }
1858 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1859 templ[i] = bit;
1860 }
1861 int j = 0;
1862 bit ^= 1;
1863 for (int i = start_index; i < end_index; i++) {
1864 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1865 templ[j] = bit;
1866 }
1867 bit ^= 1;
1868 }
1869 for (int i = j; i < kSize; i++) {
1870 templ[i] = bit;
1871 }
1872 Factory* factory = masm->isolate()->factory();
1873 // TODO(erikcorry): Cache these.
1874 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1875 for (int i = 0; i < kSize; i++) {
1876 ba->set(i, templ[i]);
1877 }
1878 masm->CheckBitInTable(ba, on_bit_set);
1879 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1880 }
1881
1882
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1883 static void CutOutRange(RegExpMacroAssembler* masm,
1884 ZoneList<int>* ranges,
1885 int start_index,
1886 int end_index,
1887 int cut_index,
1888 Label* even_label,
1889 Label* odd_label) {
1890 bool odd = (((cut_index - start_index) & 1) == 1);
1891 Label* in_range_label = odd ? odd_label : even_label;
1892 Label dummy;
1893 EmitDoubleBoundaryTest(masm,
1894 ranges->at(cut_index),
1895 ranges->at(cut_index + 1) - 1,
1896 &dummy,
1897 in_range_label,
1898 &dummy);
1899 DCHECK(!dummy.is_linked());
1900 // Cut out the single range by rewriting the array. This creates a new
1901 // range that is a merger of the two ranges on either side of the one we
1902 // are cutting out. The oddity of the labels is preserved.
1903 for (int j = cut_index; j > start_index; j--) {
1904 ranges->at(j) = ranges->at(j - 1);
1905 }
1906 for (int j = cut_index + 1; j < end_index; j++) {
1907 ranges->at(j) = ranges->at(j + 1);
1908 }
1909 }
1910
1911
1912 // Unicode case. Split the search space into kSize spaces that are handled
1913 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1914 static void SplitSearchSpace(ZoneList<int>* ranges,
1915 int start_index,
1916 int end_index,
1917 int* new_start_index,
1918 int* new_end_index,
1919 int* border) {
1920 static const int kSize = RegExpMacroAssembler::kTableSize;
1921 static const int kMask = RegExpMacroAssembler::kTableMask;
1922
1923 int first = ranges->at(start_index);
1924 int last = ranges->at(end_index) - 1;
1925
1926 *new_start_index = start_index;
1927 *border = (ranges->at(start_index) & ~kMask) + kSize;
1928 while (*new_start_index < end_index) {
1929 if (ranges->at(*new_start_index) > *border) break;
1930 (*new_start_index)++;
1931 }
1932 // new_start_index is the index of the first edge that is beyond the
1933 // current kSize space.
1934
1935 // For very large search spaces we do a binary chop search of the non-Latin1
1936 // space instead of just going to the end of the current kSize space. The
1937 // heuristics are complicated a little by the fact that any 128-character
1938 // encoding space can be quickly tested with a table lookup, so we don't
1939 // wish to do binary chop search at a smaller granularity than that. A
1940 // 128-character space can take up a lot of space in the ranges array if,
1941 // for example, we only want to match every second character (eg. the lower
1942 // case characters on some Unicode pages).
1943 int binary_chop_index = (end_index + start_index) / 2;
1944 // The first test ensures that we get to the code that handles the Latin1
1945 // range with a single not-taken branch, speeding up this important
1946 // character range (even non-Latin1 charset-based text has spaces and
1947 // punctuation).
1948 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1949 end_index - start_index > (*new_start_index - start_index) * 2 &&
1950 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1951 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1952 int scan_forward_for_section_border = binary_chop_index;;
1953 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1954
1955 while (scan_forward_for_section_border < end_index) {
1956 if (ranges->at(scan_forward_for_section_border) > new_border) {
1957 *new_start_index = scan_forward_for_section_border;
1958 *border = new_border;
1959 break;
1960 }
1961 scan_forward_for_section_border++;
1962 }
1963 }
1964
1965 DCHECK(*new_start_index > start_index);
1966 *new_end_index = *new_start_index - 1;
1967 if (ranges->at(*new_end_index) == *border) {
1968 (*new_end_index)--;
1969 }
1970 if (*border >= ranges->at(end_index)) {
1971 *border = ranges->at(end_index);
1972 *new_start_index = end_index; // Won't be used.
1973 *new_end_index = end_index - 1;
1974 }
1975 }
1976
1977
1978 // Gets a series of segment boundaries representing a character class. If the
1979 // character is in the range between an even and an odd boundary (counting from
1980 // start_index) then go to even_label, otherwise go to odd_label. We already
1981 // know that the character is in the range of min_char to max_char inclusive.
1982 // Either label can be NULL indicating backtracking. Either label can also be
1983 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1984 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1985 int start_index, int end_index, uc32 min_char,
1986 uc32 max_char, Label* fall_through,
1987 Label* even_label, Label* odd_label) {
1988 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1989 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1990
1991 int first = ranges->at(start_index);
1992 int last = ranges->at(end_index) - 1;
1993
1994 DCHECK_LT(min_char, first);
1995
1996 // Just need to test if the character is before or on-or-after
1997 // a particular character.
1998 if (start_index == end_index) {
1999 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2000 return;
2001 }
2002
2003 // Another almost trivial case: There is one interval in the middle that is
2004 // different from the end intervals.
2005 if (start_index + 1 == end_index) {
2006 EmitDoubleBoundaryTest(
2007 masm, first, last, fall_through, even_label, odd_label);
2008 return;
2009 }
2010
2011 // It's not worth using table lookup if there are very few intervals in the
2012 // character class.
2013 if (end_index - start_index <= 6) {
2014 // It is faster to test for individual characters, so we look for those
2015 // first, then try arbitrary ranges in the second round.
2016 static int kNoCutIndex = -1;
2017 int cut = kNoCutIndex;
2018 for (int i = start_index; i < end_index; i++) {
2019 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2020 cut = i;
2021 break;
2022 }
2023 }
2024 if (cut == kNoCutIndex) cut = start_index;
2025 CutOutRange(
2026 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2027 DCHECK_GE(end_index - start_index, 2);
2028 GenerateBranches(masm,
2029 ranges,
2030 start_index + 1,
2031 end_index - 1,
2032 min_char,
2033 max_char,
2034 fall_through,
2035 even_label,
2036 odd_label);
2037 return;
2038 }
2039
2040 // If there are a lot of intervals in the regexp, then we will use tables to
2041 // determine whether the character is inside or outside the character class.
2042 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2043
2044 if ((max_char >> kBits) == (min_char >> kBits)) {
2045 EmitUseLookupTable(masm,
2046 ranges,
2047 start_index,
2048 end_index,
2049 min_char,
2050 fall_through,
2051 even_label,
2052 odd_label);
2053 return;
2054 }
2055
2056 if ((min_char >> kBits) != (first >> kBits)) {
2057 masm->CheckCharacterLT(first, odd_label);
2058 GenerateBranches(masm,
2059 ranges,
2060 start_index + 1,
2061 end_index,
2062 first,
2063 max_char,
2064 fall_through,
2065 odd_label,
2066 even_label);
2067 return;
2068 }
2069
2070 int new_start_index = 0;
2071 int new_end_index = 0;
2072 int border = 0;
2073
2074 SplitSearchSpace(ranges,
2075 start_index,
2076 end_index,
2077 &new_start_index,
2078 &new_end_index,
2079 &border);
2080
2081 Label handle_rest;
2082 Label* above = &handle_rest;
2083 if (border == last + 1) {
2084 // We didn't find any section that started after the limit, so everything
2085 // above the border is one of the terminal labels.
2086 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2087 DCHECK(new_end_index == end_index - 1);
2088 }
2089
2090 DCHECK_LE(start_index, new_end_index);
2091 DCHECK_LE(new_start_index, end_index);
2092 DCHECK_LT(start_index, new_start_index);
2093 DCHECK_LT(new_end_index, end_index);
2094 DCHECK(new_end_index + 1 == new_start_index ||
2095 (new_end_index + 2 == new_start_index &&
2096 border == ranges->at(new_end_index + 1)));
2097 DCHECK_LT(min_char, border - 1);
2098 DCHECK_LT(border, max_char);
2099 DCHECK_LT(ranges->at(new_end_index), border);
2100 DCHECK(border < ranges->at(new_start_index) ||
2101 (border == ranges->at(new_start_index) &&
2102 new_start_index == end_index &&
2103 new_end_index == end_index - 1 &&
2104 border == last + 1));
2105 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2106
2107 masm->CheckCharacterGT(border - 1, above);
2108 Label dummy;
2109 GenerateBranches(masm,
2110 ranges,
2111 start_index,
2112 new_end_index,
2113 min_char,
2114 border - 1,
2115 &dummy,
2116 even_label,
2117 odd_label);
2118 if (handle_rest.is_linked()) {
2119 masm->Bind(&handle_rest);
2120 bool flip = (new_start_index & 1) != (start_index & 1);
2121 GenerateBranches(masm,
2122 ranges,
2123 new_start_index,
2124 end_index,
2125 border,
2126 max_char,
2127 &dummy,
2128 flip ? odd_label : even_label,
2129 flip ? even_label : odd_label);
2130 }
2131 }
2132
2133
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2134 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2135 RegExpCharacterClass* cc, bool one_byte,
2136 Label* on_failure, int cp_offset, bool check_offset,
2137 bool preloaded, Zone* zone) {
2138 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2139 CharacterRange::Canonicalize(ranges);
2140
2141 int max_char;
2142 if (one_byte) {
2143 max_char = String::kMaxOneByteCharCode;
2144 } else {
2145 max_char = String::kMaxUtf16CodeUnit;
2146 }
2147
2148 int range_count = ranges->length();
2149
2150 int last_valid_range = range_count - 1;
2151 while (last_valid_range >= 0) {
2152 CharacterRange& range = ranges->at(last_valid_range);
2153 if (range.from() <= max_char) {
2154 break;
2155 }
2156 last_valid_range--;
2157 }
2158
2159 if (last_valid_range < 0) {
2160 if (!cc->is_negated()) {
2161 macro_assembler->GoTo(on_failure);
2162 }
2163 if (check_offset) {
2164 macro_assembler->CheckPosition(cp_offset, on_failure);
2165 }
2166 return;
2167 }
2168
2169 if (last_valid_range == 0 &&
2170 ranges->at(0).IsEverything(max_char)) {
2171 if (cc->is_negated()) {
2172 macro_assembler->GoTo(on_failure);
2173 } else {
2174 // This is a common case hit by non-anchored expressions.
2175 if (check_offset) {
2176 macro_assembler->CheckPosition(cp_offset, on_failure);
2177 }
2178 }
2179 return;
2180 }
2181
2182 if (!preloaded) {
2183 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2184 }
2185
2186 if (cc->is_standard(zone) &&
2187 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2188 on_failure)) {
2189 return;
2190 }
2191
2192
2193 // A new list with ascending entries. Each entry is a code unit
2194 // where there is a boundary between code units that are part of
2195 // the class and code units that are not. Normally we insert an
2196 // entry at zero which goes to the failure label, but if there
2197 // was already one there we fall through for success on that entry.
2198 // Subsequent entries have alternating meaning (success/failure).
2199 ZoneList<int>* range_boundaries =
2200 new(zone) ZoneList<int>(last_valid_range, zone);
2201
2202 bool zeroth_entry_is_failure = !cc->is_negated();
2203
2204 for (int i = 0; i <= last_valid_range; i++) {
2205 CharacterRange& range = ranges->at(i);
2206 if (range.from() == 0) {
2207 DCHECK_EQ(i, 0);
2208 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2209 } else {
2210 range_boundaries->Add(range.from(), zone);
2211 }
2212 range_boundaries->Add(range.to() + 1, zone);
2213 }
2214 int end_index = range_boundaries->length() - 1;
2215 if (range_boundaries->at(end_index) > max_char) {
2216 end_index--;
2217 }
2218
2219 Label fall_through;
2220 GenerateBranches(macro_assembler,
2221 range_boundaries,
2222 0, // start_index.
2223 end_index,
2224 0, // min_char.
2225 max_char,
2226 &fall_through,
2227 zeroth_entry_is_failure ? &fall_through : on_failure,
2228 zeroth_entry_is_failure ? on_failure : &fall_through);
2229 macro_assembler->Bind(&fall_through);
2230 }
2231
2232
~RegExpNode()2233 RegExpNode::~RegExpNode() {
2234 }
2235
2236
LimitVersions(RegExpCompiler * compiler,Trace * trace)2237 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2238 Trace* trace) {
2239 // If we are generating a greedy loop then don't stop and don't reuse code.
2240 if (trace->stop_node() != NULL) {
2241 return CONTINUE;
2242 }
2243
2244 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2245 if (trace->is_trivial()) {
2246 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2247 // If a generic version is already scheduled to be generated or we have
2248 // recursed too deeply then just generate a jump to that code.
2249 macro_assembler->GoTo(&label_);
2250 // This will queue it up for generation of a generic version if it hasn't
2251 // already been queued.
2252 compiler->AddWork(this);
2253 return DONE;
2254 }
2255 // Generate generic version of the node and bind the label for later use.
2256 macro_assembler->Bind(&label_);
2257 return CONTINUE;
2258 }
2259
2260 // We are being asked to make a non-generic version. Keep track of how many
2261 // non-generic versions we generate so as not to overdo it.
2262 trace_count_++;
2263 if (KeepRecursing(compiler) && compiler->optimize() &&
2264 trace_count_ < kMaxCopiesCodeGenerated) {
2265 return CONTINUE;
2266 }
2267
2268 // If we get here code has been generated for this node too many times or
2269 // recursion is too deep. Time to switch to a generic version. The code for
2270 // generic versions above can handle deep recursion properly.
2271 bool was_limiting = compiler->limiting_recursion();
2272 compiler->set_limiting_recursion(true);
2273 trace->Flush(compiler, this);
2274 compiler->set_limiting_recursion(was_limiting);
2275 return DONE;
2276 }
2277
2278
KeepRecursing(RegExpCompiler * compiler)2279 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2280 return !compiler->limiting_recursion() &&
2281 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2282 }
2283
2284
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2285 int ActionNode::EatsAtLeast(int still_to_find,
2286 int budget,
2287 bool not_at_start) {
2288 if (budget <= 0) return 0;
2289 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2290 return on_success()->EatsAtLeast(still_to_find,
2291 budget - 1,
2292 not_at_start);
2293 }
2294
2295
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2296 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2297 BoyerMooreLookahead* bm, bool not_at_start) {
2298 if (action_type_ == BEGIN_SUBMATCH) {
2299 bm->SetRest(offset);
2300 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2301 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2302 }
2303 SaveBMInfo(bm, not_at_start, offset);
2304 }
2305
2306
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2307 int AssertionNode::EatsAtLeast(int still_to_find,
2308 int budget,
2309 bool not_at_start) {
2310 if (budget <= 0) return 0;
2311 // If we know we are not at the start and we are asked "how many characters
2312 // will you match if you succeed?" then we can answer anything since false
2313 // implies false. So lets just return the max answer (still_to_find) since
2314 // that won't prevent us from preloading a lot of characters for the other
2315 // branches in the node graph.
2316 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2317 return on_success()->EatsAtLeast(still_to_find,
2318 budget - 1,
2319 not_at_start);
2320 }
2321
2322
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2323 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2324 BoyerMooreLookahead* bm, bool not_at_start) {
2325 // Match the behaviour of EatsAtLeast on this node.
2326 if (assertion_type() == AT_START && not_at_start) return;
2327 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2328 SaveBMInfo(bm, not_at_start, offset);
2329 }
2330
2331
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2332 int BackReferenceNode::EatsAtLeast(int still_to_find,
2333 int budget,
2334 bool not_at_start) {
2335 if (read_backward()) return 0;
2336 if (budget <= 0) return 0;
2337 return on_success()->EatsAtLeast(still_to_find,
2338 budget - 1,
2339 not_at_start);
2340 }
2341
2342
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2343 int TextNode::EatsAtLeast(int still_to_find,
2344 int budget,
2345 bool not_at_start) {
2346 if (read_backward()) return 0;
2347 int answer = Length();
2348 if (answer >= still_to_find) return answer;
2349 if (budget <= 0) return answer;
2350 // We are not at start after this node so we set the last argument to 'true'.
2351 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2352 budget - 1,
2353 true);
2354 }
2355
2356
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2357 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2358 bool not_at_start) {
2359 if (budget <= 0) return 0;
2360 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2361 // afterwards.
2362 RegExpNode* node = alternatives_->at(1).node();
2363 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2364 }
2365
2366
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2367 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2368 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2369 bool not_at_start) {
2370 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2371 // afterwards.
2372 RegExpNode* node = alternatives_->at(1).node();
2373 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2374 }
2375
2376
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2377 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2378 int budget,
2379 RegExpNode* ignore_this_node,
2380 bool not_at_start) {
2381 if (budget <= 0) return 0;
2382 int min = 100;
2383 int choice_count = alternatives_->length();
2384 budget = (budget - 1) / choice_count;
2385 for (int i = 0; i < choice_count; i++) {
2386 RegExpNode* node = alternatives_->at(i).node();
2387 if (node == ignore_this_node) continue;
2388 int node_eats_at_least =
2389 node->EatsAtLeast(still_to_find, budget, not_at_start);
2390 if (node_eats_at_least < min) min = node_eats_at_least;
2391 if (min == 0) return 0;
2392 }
2393 return min;
2394 }
2395
2396
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2397 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2398 int budget,
2399 bool not_at_start) {
2400 return EatsAtLeastHelper(still_to_find,
2401 budget - 1,
2402 loop_node_,
2403 not_at_start);
2404 }
2405
2406
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2407 int ChoiceNode::EatsAtLeast(int still_to_find,
2408 int budget,
2409 bool not_at_start) {
2410 return EatsAtLeastHelper(still_to_find,
2411 budget,
2412 NULL,
2413 not_at_start);
2414 }
2415
2416
2417 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2418 static inline uint32_t SmearBitsRight(uint32_t v) {
2419 v |= v >> 1;
2420 v |= v >> 2;
2421 v |= v >> 4;
2422 v |= v >> 8;
2423 v |= v >> 16;
2424 return v;
2425 }
2426
2427
Rationalize(bool asc)2428 bool QuickCheckDetails::Rationalize(bool asc) {
2429 bool found_useful_op = false;
2430 uint32_t char_mask;
2431 if (asc) {
2432 char_mask = String::kMaxOneByteCharCode;
2433 } else {
2434 char_mask = String::kMaxUtf16CodeUnit;
2435 }
2436 mask_ = 0;
2437 value_ = 0;
2438 int char_shift = 0;
2439 for (int i = 0; i < characters_; i++) {
2440 Position* pos = &positions_[i];
2441 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2442 found_useful_op = true;
2443 }
2444 mask_ |= (pos->mask & char_mask) << char_shift;
2445 value_ |= (pos->value & char_mask) << char_shift;
2446 char_shift += asc ? 8 : 16;
2447 }
2448 return found_useful_op;
2449 }
2450
2451
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2452 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2453 Trace* bounds_check_trace,
2454 Trace* trace,
2455 bool preload_has_checked_bounds,
2456 Label* on_possible_success,
2457 QuickCheckDetails* details,
2458 bool fall_through_on_failure) {
2459 if (details->characters() == 0) return false;
2460 GetQuickCheckDetails(
2461 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2462 if (details->cannot_match()) return false;
2463 if (!details->Rationalize(compiler->one_byte())) return false;
2464 DCHECK(details->characters() == 1 ||
2465 compiler->macro_assembler()->CanReadUnaligned());
2466 uint32_t mask = details->mask();
2467 uint32_t value = details->value();
2468
2469 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2470
2471 if (trace->characters_preloaded() != details->characters()) {
2472 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2473 // We are attempting to preload the minimum number of characters
2474 // any choice would eat, so if the bounds check fails, then none of the
2475 // choices can succeed, so we can just immediately backtrack, rather
2476 // than go to the next choice.
2477 assembler->LoadCurrentCharacter(trace->cp_offset(),
2478 bounds_check_trace->backtrack(),
2479 !preload_has_checked_bounds,
2480 details->characters());
2481 }
2482
2483
2484 bool need_mask = true;
2485
2486 if (details->characters() == 1) {
2487 // If number of characters preloaded is 1 then we used a byte or 16 bit
2488 // load so the value is already masked down.
2489 uint32_t char_mask;
2490 if (compiler->one_byte()) {
2491 char_mask = String::kMaxOneByteCharCode;
2492 } else {
2493 char_mask = String::kMaxUtf16CodeUnit;
2494 }
2495 if ((mask & char_mask) == char_mask) need_mask = false;
2496 mask &= char_mask;
2497 } else {
2498 // For 2-character preloads in one-byte mode or 1-character preloads in
2499 // two-byte mode we also use a 16 bit load with zero extend.
2500 static const uint32_t kTwoByteMask = 0xffff;
2501 static const uint32_t kFourByteMask = 0xffffffff;
2502 if (details->characters() == 2 && compiler->one_byte()) {
2503 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2504 } else if (details->characters() == 1 && !compiler->one_byte()) {
2505 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2506 } else {
2507 if (mask == kFourByteMask) need_mask = false;
2508 }
2509 }
2510
2511 if (fall_through_on_failure) {
2512 if (need_mask) {
2513 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2514 } else {
2515 assembler->CheckCharacter(value, on_possible_success);
2516 }
2517 } else {
2518 if (need_mask) {
2519 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2520 } else {
2521 assembler->CheckNotCharacter(value, trace->backtrack());
2522 }
2523 }
2524 return true;
2525 }
2526
2527
2528 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2529 // super-class in the .h file).
2530 //
2531 // We iterate along the text object, building up for each character a
2532 // mask and value that can be used to test for a quick failure to match.
2533 // The masks and values for the positions will be combined into a single
2534 // machine word for the current character width in order to be used in
2535 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2536 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2537 RegExpCompiler* compiler,
2538 int characters_filled_in,
2539 bool not_at_start) {
2540 // Do not collect any quick check details if the text node reads backward,
2541 // since it reads in the opposite direction than we use for quick checks.
2542 if (read_backward()) return;
2543 Isolate* isolate = compiler->macro_assembler()->isolate();
2544 DCHECK(characters_filled_in < details->characters());
2545 int characters = details->characters();
2546 int char_mask;
2547 if (compiler->one_byte()) {
2548 char_mask = String::kMaxOneByteCharCode;
2549 } else {
2550 char_mask = String::kMaxUtf16CodeUnit;
2551 }
2552 for (int k = 0; k < elements()->length(); k++) {
2553 TextElement elm = elements()->at(k);
2554 if (elm.text_type() == TextElement::ATOM) {
2555 Vector<const uc16> quarks = elm.atom()->data();
2556 for (int i = 0; i < characters && i < quarks.length(); i++) {
2557 QuickCheckDetails::Position* pos =
2558 details->positions(characters_filled_in);
2559 uc16 c = quarks[i];
2560 if (compiler->ignore_case()) {
2561 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2562 int length = GetCaseIndependentLetters(isolate, c,
2563 compiler->one_byte(), chars);
2564 if (length == 0) {
2565 // This can happen because all case variants are non-Latin1, but we
2566 // know the input is Latin1.
2567 details->set_cannot_match();
2568 pos->determines_perfectly = false;
2569 return;
2570 }
2571 if (length == 1) {
2572 // This letter has no case equivalents, so it's nice and simple
2573 // and the mask-compare will determine definitely whether we have
2574 // a match at this character position.
2575 pos->mask = char_mask;
2576 pos->value = c;
2577 pos->determines_perfectly = true;
2578 } else {
2579 uint32_t common_bits = char_mask;
2580 uint32_t bits = chars[0];
2581 for (int j = 1; j < length; j++) {
2582 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2583 common_bits ^= differing_bits;
2584 bits &= common_bits;
2585 }
2586 // If length is 2 and common bits has only one zero in it then
2587 // our mask and compare instruction will determine definitely
2588 // whether we have a match at this character position. Otherwise
2589 // it can only be an approximate check.
2590 uint32_t one_zero = (common_bits | ~char_mask);
2591 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2592 pos->determines_perfectly = true;
2593 }
2594 pos->mask = common_bits;
2595 pos->value = bits;
2596 }
2597 } else {
2598 // Don't ignore case. Nice simple case where the mask-compare will
2599 // determine definitely whether we have a match at this character
2600 // position.
2601 if (c > char_mask) {
2602 details->set_cannot_match();
2603 pos->determines_perfectly = false;
2604 return;
2605 }
2606 pos->mask = char_mask;
2607 pos->value = c;
2608 pos->determines_perfectly = true;
2609 }
2610 characters_filled_in++;
2611 DCHECK(characters_filled_in <= details->characters());
2612 if (characters_filled_in == details->characters()) {
2613 return;
2614 }
2615 }
2616 } else {
2617 QuickCheckDetails::Position* pos =
2618 details->positions(characters_filled_in);
2619 RegExpCharacterClass* tree = elm.char_class();
2620 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2621 if (tree->is_negated()) {
2622 // A quick check uses multi-character mask and compare. There is no
2623 // useful way to incorporate a negative char class into this scheme
2624 // so we just conservatively create a mask and value that will always
2625 // succeed.
2626 pos->mask = 0;
2627 pos->value = 0;
2628 } else {
2629 int first_range = 0;
2630 while (ranges->at(first_range).from() > char_mask) {
2631 first_range++;
2632 if (first_range == ranges->length()) {
2633 details->set_cannot_match();
2634 pos->determines_perfectly = false;
2635 return;
2636 }
2637 }
2638 CharacterRange range = ranges->at(first_range);
2639 uc16 from = range.from();
2640 uc16 to = range.to();
2641 if (to > char_mask) {
2642 to = char_mask;
2643 }
2644 uint32_t differing_bits = (from ^ to);
2645 // A mask and compare is only perfect if the differing bits form a
2646 // number like 00011111 with one single block of trailing 1s.
2647 if ((differing_bits & (differing_bits + 1)) == 0 &&
2648 from + differing_bits == to) {
2649 pos->determines_perfectly = true;
2650 }
2651 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2652 uint32_t bits = (from & common_bits);
2653 for (int i = first_range + 1; i < ranges->length(); i++) {
2654 CharacterRange range = ranges->at(i);
2655 uc16 from = range.from();
2656 uc16 to = range.to();
2657 if (from > char_mask) continue;
2658 if (to > char_mask) to = char_mask;
2659 // Here we are combining more ranges into the mask and compare
2660 // value. With each new range the mask becomes more sparse and
2661 // so the chances of a false positive rise. A character class
2662 // with multiple ranges is assumed never to be equivalent to a
2663 // mask and compare operation.
2664 pos->determines_perfectly = false;
2665 uint32_t new_common_bits = (from ^ to);
2666 new_common_bits = ~SmearBitsRight(new_common_bits);
2667 common_bits &= new_common_bits;
2668 bits &= new_common_bits;
2669 uint32_t differing_bits = (from & common_bits) ^ bits;
2670 common_bits ^= differing_bits;
2671 bits &= common_bits;
2672 }
2673 pos->mask = common_bits;
2674 pos->value = bits;
2675 }
2676 characters_filled_in++;
2677 DCHECK(characters_filled_in <= details->characters());
2678 if (characters_filled_in == details->characters()) {
2679 return;
2680 }
2681 }
2682 }
2683 DCHECK(characters_filled_in != details->characters());
2684 if (!details->cannot_match()) {
2685 on_success()-> GetQuickCheckDetails(details,
2686 compiler,
2687 characters_filled_in,
2688 true);
2689 }
2690 }
2691
2692
Clear()2693 void QuickCheckDetails::Clear() {
2694 for (int i = 0; i < characters_; i++) {
2695 positions_[i].mask = 0;
2696 positions_[i].value = 0;
2697 positions_[i].determines_perfectly = false;
2698 }
2699 characters_ = 0;
2700 }
2701
2702
Advance(int by,bool one_byte)2703 void QuickCheckDetails::Advance(int by, bool one_byte) {
2704 if (by >= characters_ || by < 0) {
2705 DCHECK_IMPLIES(by < 0, characters_ == 0);
2706 Clear();
2707 return;
2708 }
2709 DCHECK_LE(characters_ - by, 4);
2710 DCHECK_LE(characters_, 4);
2711 for (int i = 0; i < characters_ - by; i++) {
2712 positions_[i] = positions_[by + i];
2713 }
2714 for (int i = characters_ - by; i < characters_; i++) {
2715 positions_[i].mask = 0;
2716 positions_[i].value = 0;
2717 positions_[i].determines_perfectly = false;
2718 }
2719 characters_ -= by;
2720 // We could change mask_ and value_ here but we would never advance unless
2721 // they had already been used in a check and they won't be used again because
2722 // it would gain us nothing. So there's no point.
2723 }
2724
2725
Merge(QuickCheckDetails * other,int from_index)2726 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2727 DCHECK(characters_ == other->characters_);
2728 if (other->cannot_match_) {
2729 return;
2730 }
2731 if (cannot_match_) {
2732 *this = *other;
2733 return;
2734 }
2735 for (int i = from_index; i < characters_; i++) {
2736 QuickCheckDetails::Position* pos = positions(i);
2737 QuickCheckDetails::Position* other_pos = other->positions(i);
2738 if (pos->mask != other_pos->mask ||
2739 pos->value != other_pos->value ||
2740 !other_pos->determines_perfectly) {
2741 // Our mask-compare operation will be approximate unless we have the
2742 // exact same operation on both sides of the alternation.
2743 pos->determines_perfectly = false;
2744 }
2745 pos->mask &= other_pos->mask;
2746 pos->value &= pos->mask;
2747 other_pos->value &= pos->mask;
2748 uc16 differing_bits = (pos->value ^ other_pos->value);
2749 pos->mask &= ~differing_bits;
2750 pos->value &= pos->mask;
2751 }
2752 }
2753
2754
2755 class VisitMarker {
2756 public:
VisitMarker(NodeInfo * info)2757 explicit VisitMarker(NodeInfo* info) : info_(info) {
2758 DCHECK(!info->visited);
2759 info->visited = true;
2760 }
~VisitMarker()2761 ~VisitMarker() {
2762 info_->visited = false;
2763 }
2764 private:
2765 NodeInfo* info_;
2766 };
2767
2768
FilterOneByte(int depth,bool ignore_case)2769 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2770 if (info()->replacement_calculated) return replacement();
2771 if (depth < 0) return this;
2772 DCHECK(!info()->visited);
2773 VisitMarker marker(info());
2774 return FilterSuccessor(depth - 1, ignore_case);
2775 }
2776
2777
FilterSuccessor(int depth,bool ignore_case)2778 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2779 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2780 if (next == NULL) return set_replacement(NULL);
2781 on_success_ = next;
2782 return set_replacement(this);
2783 }
2784
2785
2786 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2787 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2788 // TODO(dcarney): this could be a lot more efficient.
2789 return range.Contains(0x39c) ||
2790 range.Contains(0x3bc) || range.Contains(0x178);
2791 }
2792
2793
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2794 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2795 for (int i = 0; i < ranges->length(); i++) {
2796 // TODO(dcarney): this could be a lot more efficient.
2797 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2798 }
2799 return false;
2800 }
2801
2802
FilterOneByte(int depth,bool ignore_case)2803 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2804 if (info()->replacement_calculated) return replacement();
2805 if (depth < 0) return this;
2806 DCHECK(!info()->visited);
2807 VisitMarker marker(info());
2808 int element_count = elements()->length();
2809 for (int i = 0; i < element_count; i++) {
2810 TextElement elm = elements()->at(i);
2811 if (elm.text_type() == TextElement::ATOM) {
2812 Vector<const uc16> quarks = elm.atom()->data();
2813 for (int j = 0; j < quarks.length(); j++) {
2814 uint16_t c = quarks[j];
2815 if (c <= String::kMaxOneByteCharCode) continue;
2816 if (!ignore_case) return set_replacement(NULL);
2817 // Here, we need to check for characters whose upper and lower cases
2818 // are outside the Latin-1 range.
2819 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2820 // Character is outside Latin-1 completely
2821 if (converted == 0) return set_replacement(NULL);
2822 // Convert quark to Latin-1 in place.
2823 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2824 copy[j] = converted;
2825 }
2826 } else {
2827 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2828 RegExpCharacterClass* cc = elm.char_class();
2829 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2830 CharacterRange::Canonicalize(ranges);
2831 // Now they are in order so we only need to look at the first.
2832 int range_count = ranges->length();
2833 if (cc->is_negated()) {
2834 if (range_count != 0 &&
2835 ranges->at(0).from() == 0 &&
2836 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2837 // This will be handled in a later filter.
2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2839 return set_replacement(NULL);
2840 }
2841 } else {
2842 if (range_count == 0 ||
2843 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2844 // This will be handled in a later filter.
2845 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2846 return set_replacement(NULL);
2847 }
2848 }
2849 }
2850 }
2851 return FilterSuccessor(depth - 1, ignore_case);
2852 }
2853
2854
FilterOneByte(int depth,bool ignore_case)2855 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2856 if (info()->replacement_calculated) return replacement();
2857 if (depth < 0) return this;
2858 if (info()->visited) return this;
2859 {
2860 VisitMarker marker(info());
2861
2862 RegExpNode* continue_replacement =
2863 continue_node_->FilterOneByte(depth - 1, ignore_case);
2864 // If we can't continue after the loop then there is no sense in doing the
2865 // loop.
2866 if (continue_replacement == NULL) return set_replacement(NULL);
2867 }
2868
2869 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2870 }
2871
2872
FilterOneByte(int depth,bool ignore_case)2873 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2874 if (info()->replacement_calculated) return replacement();
2875 if (depth < 0) return this;
2876 if (info()->visited) return this;
2877 VisitMarker marker(info());
2878 int choice_count = alternatives_->length();
2879
2880 for (int i = 0; i < choice_count; i++) {
2881 GuardedAlternative alternative = alternatives_->at(i);
2882 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2883 set_replacement(this);
2884 return this;
2885 }
2886 }
2887
2888 int surviving = 0;
2889 RegExpNode* survivor = NULL;
2890 for (int i = 0; i < choice_count; i++) {
2891 GuardedAlternative alternative = alternatives_->at(i);
2892 RegExpNode* replacement =
2893 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2894 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2895 if (replacement != NULL) {
2896 alternatives_->at(i).set_node(replacement);
2897 surviving++;
2898 survivor = replacement;
2899 }
2900 }
2901 if (surviving < 2) return set_replacement(survivor);
2902
2903 set_replacement(this);
2904 if (surviving == choice_count) {
2905 return this;
2906 }
2907 // Only some of the nodes survived the filtering. We need to rebuild the
2908 // alternatives list.
2909 ZoneList<GuardedAlternative>* new_alternatives =
2910 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2911 for (int i = 0; i < choice_count; i++) {
2912 RegExpNode* replacement =
2913 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2914 if (replacement != NULL) {
2915 alternatives_->at(i).set_node(replacement);
2916 new_alternatives->Add(alternatives_->at(i), zone());
2917 }
2918 }
2919 alternatives_ = new_alternatives;
2920 return this;
2921 }
2922
2923
FilterOneByte(int depth,bool ignore_case)2924 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2925 bool ignore_case) {
2926 if (info()->replacement_calculated) return replacement();
2927 if (depth < 0) return this;
2928 if (info()->visited) return this;
2929 VisitMarker marker(info());
2930 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2931 // afterwards.
2932 RegExpNode* node = alternatives_->at(1).node();
2933 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2934 if (replacement == NULL) return set_replacement(NULL);
2935 alternatives_->at(1).set_node(replacement);
2936
2937 RegExpNode* neg_node = alternatives_->at(0).node();
2938 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2939 // If the negative lookahead is always going to fail then
2940 // we don't need to check it.
2941 if (neg_replacement == NULL) return set_replacement(replacement);
2942 alternatives_->at(0).set_node(neg_replacement);
2943 return set_replacement(this);
2944 }
2945
2946
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2947 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2948 RegExpCompiler* compiler,
2949 int characters_filled_in,
2950 bool not_at_start) {
2951 if (body_can_be_zero_length_ || info()->visited) return;
2952 VisitMarker marker(info());
2953 return ChoiceNode::GetQuickCheckDetails(details,
2954 compiler,
2955 characters_filled_in,
2956 not_at_start);
2957 }
2958
2959
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2960 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2961 BoyerMooreLookahead* bm, bool not_at_start) {
2962 if (body_can_be_zero_length_ || budget <= 0) {
2963 bm->SetRest(offset);
2964 SaveBMInfo(bm, not_at_start, offset);
2965 return;
2966 }
2967 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2968 SaveBMInfo(bm, not_at_start, offset);
2969 }
2970
2971
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2972 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2973 RegExpCompiler* compiler,
2974 int characters_filled_in,
2975 bool not_at_start) {
2976 not_at_start = (not_at_start || not_at_start_);
2977 int choice_count = alternatives_->length();
2978 DCHECK(choice_count > 0);
2979 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2980 compiler,
2981 characters_filled_in,
2982 not_at_start);
2983 for (int i = 1; i < choice_count; i++) {
2984 QuickCheckDetails new_details(details->characters());
2985 RegExpNode* node = alternatives_->at(i).node();
2986 node->GetQuickCheckDetails(&new_details, compiler,
2987 characters_filled_in,
2988 not_at_start);
2989 // Here we merge the quick match details of the two branches.
2990 details->Merge(&new_details, characters_filled_in);
2991 }
2992 }
2993
2994
2995 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2996 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2997 Label* word,
2998 Label* non_word,
2999 bool fall_through_on_word) {
3000 if (assembler->CheckSpecialCharacterClass(
3001 fall_through_on_word ? 'w' : 'W',
3002 fall_through_on_word ? non_word : word)) {
3003 // Optimized implementation available.
3004 return;
3005 }
3006 assembler->CheckCharacterGT('z', non_word);
3007 assembler->CheckCharacterLT('0', non_word);
3008 assembler->CheckCharacterGT('a' - 1, word);
3009 assembler->CheckCharacterLT('9' + 1, word);
3010 assembler->CheckCharacterLT('A', non_word);
3011 assembler->CheckCharacterLT('Z' + 1, word);
3012 if (fall_through_on_word) {
3013 assembler->CheckNotCharacter('_', non_word);
3014 } else {
3015 assembler->CheckCharacter('_', word);
3016 }
3017 }
3018
3019
3020 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3021 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3022 static void EmitHat(RegExpCompiler* compiler,
3023 RegExpNode* on_success,
3024 Trace* trace) {
3025 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3026 // We will be loading the previous character into the current character
3027 // register.
3028 Trace new_trace(*trace);
3029 new_trace.InvalidateCurrentCharacter();
3030
3031 Label ok;
3032 if (new_trace.cp_offset() == 0) {
3033 // The start of input counts as a newline in this context, so skip to
3034 // ok if we are at the start.
3035 assembler->CheckAtStart(&ok);
3036 }
3037 // We already checked that we are not at the start of input so it must be
3038 // OK to load the previous character.
3039 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3040 new_trace.backtrack(),
3041 false);
3042 if (!assembler->CheckSpecialCharacterClass('n',
3043 new_trace.backtrack())) {
3044 // Newline means \n, \r, 0x2028 or 0x2029.
3045 if (!compiler->one_byte()) {
3046 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3047 }
3048 assembler->CheckCharacter('\n', &ok);
3049 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3050 }
3051 assembler->Bind(&ok);
3052 on_success->Emit(compiler, &new_trace);
3053 }
3054
3055
3056 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3057 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3058 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3059 Isolate* isolate = assembler->isolate();
3060 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3061 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3062 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3063 if (lookahead == NULL) {
3064 int eats_at_least =
3065 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3066 kRecursionBudget,
3067 not_at_start));
3068 if (eats_at_least >= 1) {
3069 BoyerMooreLookahead* bm =
3070 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3071 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3072 if (bm->at(0)->is_non_word())
3073 next_is_word_character = Trace::FALSE_VALUE;
3074 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3075 }
3076 } else {
3077 if (lookahead->at(0)->is_non_word())
3078 next_is_word_character = Trace::FALSE_VALUE;
3079 if (lookahead->at(0)->is_word())
3080 next_is_word_character = Trace::TRUE_VALUE;
3081 }
3082 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3083 if (next_is_word_character == Trace::UNKNOWN) {
3084 Label before_non_word;
3085 Label before_word;
3086 if (trace->characters_preloaded() != 1) {
3087 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3088 }
3089 // Fall through on non-word.
3090 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3091 // Next character is not a word character.
3092 assembler->Bind(&before_non_word);
3093 Label ok;
3094 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3095 assembler->GoTo(&ok);
3096
3097 assembler->Bind(&before_word);
3098 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3099 assembler->Bind(&ok);
3100 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3101 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3102 } else {
3103 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3104 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3105 }
3106 }
3107
3108
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3109 void AssertionNode::BacktrackIfPrevious(
3110 RegExpCompiler* compiler,
3111 Trace* trace,
3112 AssertionNode::IfPrevious backtrack_if_previous) {
3113 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3114 Trace new_trace(*trace);
3115 new_trace.InvalidateCurrentCharacter();
3116
3117 Label fall_through, dummy;
3118
3119 Label* non_word = backtrack_if_previous == kIsNonWord ?
3120 new_trace.backtrack() :
3121 &fall_through;
3122 Label* word = backtrack_if_previous == kIsNonWord ?
3123 &fall_through :
3124 new_trace.backtrack();
3125
3126 if (new_trace.cp_offset() == 0) {
3127 // The start of input counts as a non-word character, so the question is
3128 // decided if we are at the start.
3129 assembler->CheckAtStart(non_word);
3130 }
3131 // We already checked that we are not at the start of input so it must be
3132 // OK to load the previous character.
3133 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3134 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3135
3136 assembler->Bind(&fall_through);
3137 on_success()->Emit(compiler, &new_trace);
3138 }
3139
3140
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3141 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3142 RegExpCompiler* compiler,
3143 int filled_in,
3144 bool not_at_start) {
3145 if (assertion_type_ == AT_START && not_at_start) {
3146 details->set_cannot_match();
3147 return;
3148 }
3149 return on_success()->GetQuickCheckDetails(details,
3150 compiler,
3151 filled_in,
3152 not_at_start);
3153 }
3154
3155
Emit(RegExpCompiler * compiler,Trace * trace)3156 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3157 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3158 switch (assertion_type_) {
3159 case AT_END: {
3160 Label ok;
3161 assembler->CheckPosition(trace->cp_offset(), &ok);
3162 assembler->GoTo(trace->backtrack());
3163 assembler->Bind(&ok);
3164 break;
3165 }
3166 case AT_START: {
3167 if (trace->at_start() == Trace::FALSE_VALUE) {
3168 assembler->GoTo(trace->backtrack());
3169 return;
3170 }
3171 if (trace->at_start() == Trace::UNKNOWN) {
3172 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3173 Trace at_start_trace = *trace;
3174 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3175 on_success()->Emit(compiler, &at_start_trace);
3176 return;
3177 }
3178 }
3179 break;
3180 case AFTER_NEWLINE:
3181 EmitHat(compiler, on_success(), trace);
3182 return;
3183 case AT_BOUNDARY:
3184 case AT_NON_BOUNDARY: {
3185 EmitBoundaryCheck(compiler, trace);
3186 return;
3187 }
3188 }
3189 on_success()->Emit(compiler, trace);
3190 }
3191
3192
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3193 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3194 if (quick_check == NULL) return false;
3195 if (offset >= quick_check->characters()) return false;
3196 return quick_check->positions(offset)->determines_perfectly;
3197 }
3198
3199
UpdateBoundsCheck(int index,int * checked_up_to)3200 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3201 if (index > *checked_up_to) {
3202 *checked_up_to = index;
3203 }
3204 }
3205
3206
3207 // We call this repeatedly to generate code for each pass over the text node.
3208 // The passes are in increasing order of difficulty because we hope one
3209 // of the first passes will fail in which case we are saved the work of the
3210 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3211 // we will check the '%' in the first pass, the case independent 'a' in the
3212 // second pass and the character class in the last pass.
3213 //
3214 // The passes are done from right to left, so for example to test for /bar/
3215 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3216 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3217 // bounds check most of the time. In the example we only need to check for
3218 // end-of-input when loading the putative 'r'.
3219 //
3220 // A slight complication involves the fact that the first character may already
3221 // be fetched into a register by the previous node. In this case we want to
3222 // do the test for that character first. We do this in separate passes. The
3223 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3224 // pass has been performed then subsequent passes will have true in
3225 // first_element_checked to indicate that that character does not need to be
3226 // checked again.
3227 //
3228 // In addition to all this we are passed a Trace, which can
3229 // contain an AlternativeGeneration object. In this AlternativeGeneration
3230 // object we can see details of any quick check that was already passed in
3231 // order to get to the code we are now generating. The quick check can involve
3232 // loading characters, which means we do not need to recheck the bounds
3233 // up to the limit the quick check already checked. In addition the quick
3234 // check can have involved a mask and compare operation which may simplify
3235 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3236 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3237 TextEmitPassType pass,
3238 bool preloaded,
3239 Trace* trace,
3240 bool first_element_checked,
3241 int* checked_up_to) {
3242 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3243 Isolate* isolate = assembler->isolate();
3244 bool one_byte = compiler->one_byte();
3245 Label* backtrack = trace->backtrack();
3246 QuickCheckDetails* quick_check = trace->quick_check_performed();
3247 int element_count = elements()->length();
3248 int backward_offset = read_backward() ? -Length() : 0;
3249 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3250 TextElement elm = elements()->at(i);
3251 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3252 if (elm.text_type() == TextElement::ATOM) {
3253 Vector<const uc16> quarks = elm.atom()->data();
3254 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3255 if (first_element_checked && i == 0 && j == 0) continue;
3256 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3257 EmitCharacterFunction* emit_function = NULL;
3258 switch (pass) {
3259 case NON_LATIN1_MATCH:
3260 DCHECK(one_byte);
3261 if (quarks[j] > String::kMaxOneByteCharCode) {
3262 assembler->GoTo(backtrack);
3263 return;
3264 }
3265 break;
3266 case NON_LETTER_CHARACTER_MATCH:
3267 emit_function = &EmitAtomNonLetter;
3268 break;
3269 case SIMPLE_CHARACTER_MATCH:
3270 emit_function = &EmitSimpleCharacter;
3271 break;
3272 case CASE_CHARACTER_MATCH:
3273 emit_function = &EmitAtomLetter;
3274 break;
3275 default:
3276 break;
3277 }
3278 if (emit_function != NULL) {
3279 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3280 bool bound_checked =
3281 emit_function(isolate, compiler, quarks[j], backtrack,
3282 cp_offset + j, bounds_check, preloaded);
3283 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3284 }
3285 }
3286 } else {
3287 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3288 if (pass == CHARACTER_CLASS_MATCH) {
3289 if (first_element_checked && i == 0) continue;
3290 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3291 RegExpCharacterClass* cc = elm.char_class();
3292 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3293 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3294 bounds_check, preloaded, zone());
3295 UpdateBoundsCheck(cp_offset, checked_up_to);
3296 }
3297 }
3298 }
3299 }
3300
3301
Length()3302 int TextNode::Length() {
3303 TextElement elm = elements()->last();
3304 DCHECK(elm.cp_offset() >= 0);
3305 return elm.cp_offset() + elm.length();
3306 }
3307
3308
SkipPass(int int_pass,bool ignore_case)3309 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3310 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3311 if (ignore_case) {
3312 return pass == SIMPLE_CHARACTER_MATCH;
3313 } else {
3314 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3315 }
3316 }
3317
3318
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)3319 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3320 ZoneList<CharacterRange>* ranges,
3321 bool read_backward,
3322 RegExpNode* on_success) {
3323 DCHECK_NOT_NULL(ranges);
3324 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3325 elms->Add(
3326 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3327 zone);
3328 return new (zone) TextNode(elms, read_backward, on_success);
3329 }
3330
3331
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success)3332 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3333 CharacterRange trail,
3334 bool read_backward,
3335 RegExpNode* on_success) {
3336 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3337 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3338 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3339 elms->Add(TextElement::CharClass(
3340 new (zone) RegExpCharacterClass(lead_ranges, false)),
3341 zone);
3342 elms->Add(TextElement::CharClass(
3343 new (zone) RegExpCharacterClass(trail_ranges, false)),
3344 zone);
3345 return new (zone) TextNode(elms, read_backward, on_success);
3346 }
3347
3348
3349 // This generates the code to match a text node. A text node can contain
3350 // straight character sequences (possibly to be matched in a case-independent
3351 // way) and character classes. For efficiency we do not do this in a single
3352 // pass from left to right. Instead we pass over the text node several times,
3353 // emitting code for some character positions every time. See the comment on
3354 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3355 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3356 LimitResult limit_result = LimitVersions(compiler, trace);
3357 if (limit_result == DONE) return;
3358 DCHECK(limit_result == CONTINUE);
3359
3360 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3361 compiler->SetRegExpTooBig();
3362 return;
3363 }
3364
3365 if (compiler->one_byte()) {
3366 int dummy = 0;
3367 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3368 }
3369
3370 bool first_elt_done = false;
3371 int bound_checked_to = trace->cp_offset() - 1;
3372 bound_checked_to += trace->bound_checked_up_to();
3373
3374 // If a character is preloaded into the current character register then
3375 // check that now.
3376 if (trace->characters_preloaded() == 1) {
3377 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3378 if (!SkipPass(pass, compiler->ignore_case())) {
3379 TextEmitPass(compiler,
3380 static_cast<TextEmitPassType>(pass),
3381 true,
3382 trace,
3383 false,
3384 &bound_checked_to);
3385 }
3386 }
3387 first_elt_done = true;
3388 }
3389
3390 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3391 if (!SkipPass(pass, compiler->ignore_case())) {
3392 TextEmitPass(compiler,
3393 static_cast<TextEmitPassType>(pass),
3394 false,
3395 trace,
3396 first_elt_done,
3397 &bound_checked_to);
3398 }
3399 }
3400
3401 Trace successor_trace(*trace);
3402 // If we advance backward, we may end up at the start.
3403 successor_trace.AdvanceCurrentPositionInTrace(
3404 read_backward() ? -Length() : Length(), compiler);
3405 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3406 : Trace::FALSE_VALUE);
3407 RecursionCheck rc(compiler);
3408 on_success()->Emit(compiler, &successor_trace);
3409 }
3410
3411
InvalidateCurrentCharacter()3412 void Trace::InvalidateCurrentCharacter() {
3413 characters_preloaded_ = 0;
3414 }
3415
3416
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3417 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3418 // We don't have an instruction for shifting the current character register
3419 // down or for using a shifted value for anything so lets just forget that
3420 // we preloaded any characters into it.
3421 characters_preloaded_ = 0;
3422 // Adjust the offsets of the quick check performed information. This
3423 // information is used to find out what we already determined about the
3424 // characters by means of mask and compare.
3425 quick_check_performed_.Advance(by, compiler->one_byte());
3426 cp_offset_ += by;
3427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3428 compiler->SetRegExpTooBig();
3429 cp_offset_ = 0;
3430 }
3431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3432 }
3433
3434
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3435 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3436 int element_count = elements()->length();
3437 for (int i = 0; i < element_count; i++) {
3438 TextElement elm = elements()->at(i);
3439 if (elm.text_type() == TextElement::CHAR_CLASS) {
3440 RegExpCharacterClass* cc = elm.char_class();
3441 // None of the standard character classes is different in the case
3442 // independent case and it slows us down if we don't know that.
3443 if (cc->is_standard(zone())) continue;
3444 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3445 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3446 }
3447 }
3448 }
3449
3450
GreedyLoopTextLength()3451 int TextNode::GreedyLoopTextLength() { return Length(); }
3452
3453
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3454 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3455 RegExpCompiler* compiler) {
3456 if (read_backward()) return NULL;
3457 if (elements()->length() != 1) return NULL;
3458 TextElement elm = elements()->at(0);
3459 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3460 RegExpCharacterClass* node = elm.char_class();
3461 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3462 CharacterRange::Canonicalize(ranges);
3463 if (node->is_negated()) {
3464 return ranges->length() == 0 ? on_success() : NULL;
3465 }
3466 if (ranges->length() != 1) return NULL;
3467 uint32_t max_char;
3468 if (compiler->one_byte()) {
3469 max_char = String::kMaxOneByteCharCode;
3470 } else {
3471 max_char = String::kMaxUtf16CodeUnit;
3472 }
3473 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3474 }
3475
3476
3477 // Finds the fixed match length of a sequence of nodes that goes from
3478 // this alternative and back to this choice node. If there are variable
3479 // length nodes or other complications in the way then return a sentinel
3480 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3481 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3482 GuardedAlternative* alternative) {
3483 int length = 0;
3484 RegExpNode* node = alternative->node();
3485 // Later we will generate code for all these text nodes using recursion
3486 // so we have to limit the max number.
3487 int recursion_depth = 0;
3488 while (node != this) {
3489 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3490 return kNodeIsTooComplexForGreedyLoops;
3491 }
3492 int node_length = node->GreedyLoopTextLength();
3493 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3494 return kNodeIsTooComplexForGreedyLoops;
3495 }
3496 length += node_length;
3497 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3498 node = seq_node->on_success();
3499 }
3500 return read_backward() ? -length : length;
3501 }
3502
3503
AddLoopAlternative(GuardedAlternative alt)3504 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3505 DCHECK_NULL(loop_node_);
3506 AddAlternative(alt);
3507 loop_node_ = alt.node();
3508 }
3509
3510
AddContinueAlternative(GuardedAlternative alt)3511 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3512 DCHECK_NULL(continue_node_);
3513 AddAlternative(alt);
3514 continue_node_ = alt.node();
3515 }
3516
3517
Emit(RegExpCompiler * compiler,Trace * trace)3518 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3519 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3520 if (trace->stop_node() == this) {
3521 // Back edge of greedy optimized loop node graph.
3522 int text_length =
3523 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3524 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3525 // Update the counter-based backtracking info on the stack. This is an
3526 // optimization for greedy loops (see below).
3527 DCHECK(trace->cp_offset() == text_length);
3528 macro_assembler->AdvanceCurrentPosition(text_length);
3529 macro_assembler->GoTo(trace->loop_label());
3530 return;
3531 }
3532 DCHECK_NULL(trace->stop_node());
3533 if (!trace->is_trivial()) {
3534 trace->Flush(compiler, this);
3535 return;
3536 }
3537 ChoiceNode::Emit(compiler, trace);
3538 }
3539
3540
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3541 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3542 int eats_at_least) {
3543 int preload_characters = Min(4, eats_at_least);
3544 if (compiler->macro_assembler()->CanReadUnaligned()) {
3545 bool one_byte = compiler->one_byte();
3546 if (one_byte) {
3547 if (preload_characters > 4) preload_characters = 4;
3548 // We can't preload 3 characters because there is no machine instruction
3549 // to do that. We can't just load 4 because we could be reading
3550 // beyond the end of the string, which could cause a memory fault.
3551 if (preload_characters == 3) preload_characters = 2;
3552 } else {
3553 if (preload_characters > 2) preload_characters = 2;
3554 }
3555 } else {
3556 if (preload_characters > 1) preload_characters = 1;
3557 }
3558 return preload_characters;
3559 }
3560
3561
3562 // This class is used when generating the alternatives in a choice node. It
3563 // records the way the alternative is being code generated.
3564 class AlternativeGeneration: public Malloced {
3565 public:
AlternativeGeneration()3566 AlternativeGeneration()
3567 : possible_success(),
3568 expects_preload(false),
3569 after(),
3570 quick_check_details() { }
3571 Label possible_success;
3572 bool expects_preload;
3573 Label after;
3574 QuickCheckDetails quick_check_details;
3575 };
3576
3577
3578 // Creates a list of AlternativeGenerations. If the list has a reasonable
3579 // size then it is on the stack, otherwise the excess is on the heap.
3580 class AlternativeGenerationList {
3581 public:
AlternativeGenerationList(int count,Zone * zone)3582 AlternativeGenerationList(int count, Zone* zone)
3583 : alt_gens_(count, zone) {
3584 for (int i = 0; i < count && i < kAFew; i++) {
3585 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3586 }
3587 for (int i = kAFew; i < count; i++) {
3588 alt_gens_.Add(new AlternativeGeneration(), zone);
3589 }
3590 }
~AlternativeGenerationList()3591 ~AlternativeGenerationList() {
3592 for (int i = kAFew; i < alt_gens_.length(); i++) {
3593 delete alt_gens_[i];
3594 alt_gens_[i] = NULL;
3595 }
3596 }
3597
at(int i)3598 AlternativeGeneration* at(int i) {
3599 return alt_gens_[i];
3600 }
3601
3602 private:
3603 static const int kAFew = 10;
3604 ZoneList<AlternativeGeneration*> alt_gens_;
3605 AlternativeGeneration a_few_alt_gens_[kAFew];
3606 };
3607
3608
3609 static const uc32 kRangeEndMarker = 0x110000;
3610
3611 // The '2' variant is has inclusive from and exclusive to.
3612 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3613 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3614 static const int kSpaceRanges[] = {
3615 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3616 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3617 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3618 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3619
3620 static const int kWordRanges[] = {
3621 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3622 static const int kWordRangeCount = arraysize(kWordRanges);
3623 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3624 static const int kDigitRangeCount = arraysize(kDigitRanges);
3625 static const int kSurrogateRanges[] = {
3626 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3627 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3628 static const int kLineTerminatorRanges[] = {
3629 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3630 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3631
Set(int character)3632 void BoyerMoorePositionInfo::Set(int character) {
3633 SetInterval(Interval(character, character));
3634 }
3635
3636
SetInterval(const Interval & interval)3637 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3638 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3639 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3640 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3641 surrogate_ =
3642 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3643 if (interval.to() - interval.from() >= kMapSize - 1) {
3644 if (map_count_ != kMapSize) {
3645 map_count_ = kMapSize;
3646 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3647 }
3648 return;
3649 }
3650 for (int i = interval.from(); i <= interval.to(); i++) {
3651 int mod_character = (i & kMask);
3652 if (!map_->at(mod_character)) {
3653 map_count_++;
3654 map_->at(mod_character) = true;
3655 }
3656 if (map_count_ == kMapSize) return;
3657 }
3658 }
3659
3660
SetAll()3661 void BoyerMoorePositionInfo::SetAll() {
3662 s_ = w_ = d_ = kLatticeUnknown;
3663 if (map_count_ != kMapSize) {
3664 map_count_ = kMapSize;
3665 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3666 }
3667 }
3668
3669
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3670 BoyerMooreLookahead::BoyerMooreLookahead(
3671 int length, RegExpCompiler* compiler, Zone* zone)
3672 : length_(length),
3673 compiler_(compiler) {
3674 if (compiler->one_byte()) {
3675 max_char_ = String::kMaxOneByteCharCode;
3676 } else {
3677 max_char_ = String::kMaxUtf16CodeUnit;
3678 }
3679 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3680 for (int i = 0; i < length; i++) {
3681 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3682 }
3683 }
3684
3685
3686 // Find the longest range of lookahead that has the fewest number of different
3687 // characters that can occur at a given position. Since we are optimizing two
3688 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3689 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3690 int biggest_points = 0;
3691 // If more than 32 characters out of 128 can occur it is unlikely that we can
3692 // be lucky enough to step forwards much of the time.
3693 const int kMaxMax = 32;
3694 for (int max_number_of_chars = 4;
3695 max_number_of_chars < kMaxMax;
3696 max_number_of_chars *= 2) {
3697 biggest_points =
3698 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3699 }
3700 if (biggest_points == 0) return false;
3701 return true;
3702 }
3703
3704
3705 // Find the highest-points range between 0 and length_ where the character
3706 // information is not too vague. 'Too vague' means that there are more than
3707 // max_number_of_chars that can occur at this position. Calculates the number
3708 // of points as the product of width-of-the-range and
3709 // probability-of-finding-one-of-the-characters, where the probability is
3710 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3711 int BoyerMooreLookahead::FindBestInterval(
3712 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3713 int biggest_points = old_biggest_points;
3714 static const int kSize = RegExpMacroAssembler::kTableSize;
3715 for (int i = 0; i < length_; ) {
3716 while (i < length_ && Count(i) > max_number_of_chars) i++;
3717 if (i == length_) break;
3718 int remembered_from = i;
3719 bool union_map[kSize];
3720 for (int j = 0; j < kSize; j++) union_map[j] = false;
3721 while (i < length_ && Count(i) <= max_number_of_chars) {
3722 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3723 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3724 i++;
3725 }
3726 int frequency = 0;
3727 for (int j = 0; j < kSize; j++) {
3728 if (union_map[j]) {
3729 // Add 1 to the frequency to give a small per-character boost for
3730 // the cases where our sampling is not good enough and many
3731 // characters have a frequency of zero. This means the frequency
3732 // can theoretically be up to 2*kSize though we treat it mostly as
3733 // a fraction of kSize.
3734 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3735 }
3736 }
3737 // We use the probability of skipping times the distance we are skipping to
3738 // judge the effectiveness of this. Actually we have a cut-off: By
3739 // dividing by 2 we switch off the skipping if the probability of skipping
3740 // is less than 50%. This is because the multibyte mask-and-compare
3741 // skipping in quickcheck is more likely to do well on this case.
3742 bool in_quickcheck_range =
3743 ((i - remembered_from < 4) ||
3744 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3745 // Called 'probability' but it is only a rough estimate and can actually
3746 // be outside the 0-kSize range.
3747 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3748 int points = (i - remembered_from) * probability;
3749 if (points > biggest_points) {
3750 *from = remembered_from;
3751 *to = i - 1;
3752 biggest_points = points;
3753 }
3754 }
3755 return biggest_points;
3756 }
3757
3758
3759 // Take all the characters that will not prevent a successful match if they
3760 // occur in the subject string in the range between min_lookahead and
3761 // max_lookahead (inclusive) measured from the current position. If the
3762 // character at max_lookahead offset is not one of these characters, then we
3763 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3764 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3765 int max_lookahead,
3766 Handle<ByteArray> boolean_skip_table) {
3767 const int kSize = RegExpMacroAssembler::kTableSize;
3768
3769 const int kSkipArrayEntry = 0;
3770 const int kDontSkipArrayEntry = 1;
3771
3772 for (int i = 0; i < kSize; i++) {
3773 boolean_skip_table->set(i, kSkipArrayEntry);
3774 }
3775 int skip = max_lookahead + 1 - min_lookahead;
3776
3777 for (int i = max_lookahead; i >= min_lookahead; i--) {
3778 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3779 for (int j = 0; j < kSize; j++) {
3780 if (map->at(j)) {
3781 boolean_skip_table->set(j, kDontSkipArrayEntry);
3782 }
3783 }
3784 }
3785
3786 return skip;
3787 }
3788
3789
3790 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3791 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3792 const int kSize = RegExpMacroAssembler::kTableSize;
3793
3794 int min_lookahead = 0;
3795 int max_lookahead = 0;
3796
3797 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3798
3799 bool found_single_character = false;
3800 int single_character = 0;
3801 for (int i = max_lookahead; i >= min_lookahead; i--) {
3802 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3803 if (map->map_count() > 1 ||
3804 (found_single_character && map->map_count() != 0)) {
3805 found_single_character = false;
3806 break;
3807 }
3808 for (int j = 0; j < kSize; j++) {
3809 if (map->at(j)) {
3810 found_single_character = true;
3811 single_character = j;
3812 break;
3813 }
3814 }
3815 }
3816
3817 int lookahead_width = max_lookahead + 1 - min_lookahead;
3818
3819 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3820 // The mask-compare can probably handle this better.
3821 return;
3822 }
3823
3824 if (found_single_character) {
3825 Label cont, again;
3826 masm->Bind(&again);
3827 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3828 if (max_char_ > kSize) {
3829 masm->CheckCharacterAfterAnd(single_character,
3830 RegExpMacroAssembler::kTableMask,
3831 &cont);
3832 } else {
3833 masm->CheckCharacter(single_character, &cont);
3834 }
3835 masm->AdvanceCurrentPosition(lookahead_width);
3836 masm->GoTo(&again);
3837 masm->Bind(&cont);
3838 return;
3839 }
3840
3841 Factory* factory = masm->isolate()->factory();
3842 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3843 int skip_distance = GetSkipTable(
3844 min_lookahead, max_lookahead, boolean_skip_table);
3845 DCHECK(skip_distance != 0);
3846
3847 Label cont, again;
3848 masm->Bind(&again);
3849 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3850 masm->CheckBitInTable(boolean_skip_table, &cont);
3851 masm->AdvanceCurrentPosition(skip_distance);
3852 masm->GoTo(&again);
3853 masm->Bind(&cont);
3854 }
3855
3856
3857 /* Code generation for choice nodes.
3858 *
3859 * We generate quick checks that do a mask and compare to eliminate a
3860 * choice. If the quick check succeeds then it jumps to the continuation to
3861 * do slow checks and check subsequent nodes. If it fails (the common case)
3862 * it falls through to the next choice.
3863 *
3864 * Here is the desired flow graph. Nodes directly below each other imply
3865 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3866 * 3 doesn't have a quick check so we have to call the slow check.
3867 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3868 * regexp continuation is generated directly after the Sn node, up to the
3869 * next GoTo if we decide to reuse some already generated code. Some
3870 * nodes expect preload_characters to be preloaded into the current
3871 * character register. R nodes do this preloading. Vertices are marked
3872 * F for failures and S for success (possible success in the case of quick
3873 * nodes). L, V, < and > are used as arrow heads.
3874 *
3875 * ----------> R
3876 * |
3877 * V
3878 * Q1 -----> S1
3879 * | S /
3880 * F| /
3881 * | F/
3882 * | /
3883 * | R
3884 * | /
3885 * V L
3886 * Q2 -----> S2
3887 * | S /
3888 * F| /
3889 * | F/
3890 * | /
3891 * | R
3892 * | /
3893 * V L
3894 * S3
3895 * |
3896 * F|
3897 * |
3898 * R
3899 * |
3900 * backtrack V
3901 * <----------Q4
3902 * \ F |
3903 * \ |S
3904 * \ F V
3905 * \-----S4
3906 *
3907 * For greedy loops we push the current position, then generate the code that
3908 * eats the input specially in EmitGreedyLoop. The other choice (the
3909 * continuation) is generated by the normal code in EmitChoices, and steps back
3910 * in the input to the starting position when it fails to match. The loop code
3911 * looks like this (U is the unwind code that steps back in the greedy loop).
3912 *
3913 * _____
3914 * / \
3915 * V |
3916 * ----------> S1 |
3917 * /| |
3918 * / |S |
3919 * F/ \_____/
3920 * /
3921 * |<-----
3922 * | \
3923 * V |S
3924 * Q2 ---> U----->backtrack
3925 * | F /
3926 * S| /
3927 * V F /
3928 * S2--/
3929 */
3930
GreedyLoopState(bool not_at_start)3931 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3932 counter_backtrack_trace_.set_backtrack(&label_);
3933 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3934 }
3935
3936
AssertGuardsMentionRegisters(Trace * trace)3937 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3938 #ifdef DEBUG
3939 int choice_count = alternatives_->length();
3940 for (int i = 0; i < choice_count - 1; i++) {
3941 GuardedAlternative alternative = alternatives_->at(i);
3942 ZoneList<Guard*>* guards = alternative.guards();
3943 int guard_count = (guards == NULL) ? 0 : guards->length();
3944 for (int j = 0; j < guard_count; j++) {
3945 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3946 }
3947 }
3948 #endif
3949 }
3950
3951
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3952 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3953 Trace* current_trace,
3954 PreloadState* state) {
3955 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3956 // Save some time by looking at most one machine word ahead.
3957 state->eats_at_least_ =
3958 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3959 current_trace->at_start() == Trace::FALSE_VALUE);
3960 }
3961 state->preload_characters_ =
3962 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3963
3964 state->preload_is_current_ =
3965 (current_trace->characters_preloaded() == state->preload_characters_);
3966 state->preload_has_checked_bounds_ = state->preload_is_current_;
3967 }
3968
3969
Emit(RegExpCompiler * compiler,Trace * trace)3970 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3971 int choice_count = alternatives_->length();
3972
3973 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3974 alternatives_->at(0).node()->Emit(compiler, trace);
3975 return;
3976 }
3977
3978 AssertGuardsMentionRegisters(trace);
3979
3980 LimitResult limit_result = LimitVersions(compiler, trace);
3981 if (limit_result == DONE) return;
3982 DCHECK(limit_result == CONTINUE);
3983
3984 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3985 // other choice nodes we only flush if we are out of code size budget.
3986 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3987 trace->Flush(compiler, this);
3988 return;
3989 }
3990
3991 RecursionCheck rc(compiler);
3992
3993 PreloadState preload;
3994 preload.init();
3995 GreedyLoopState greedy_loop_state(not_at_start());
3996
3997 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3998 AlternativeGenerationList alt_gens(choice_count, zone());
3999
4000 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4001 trace = EmitGreedyLoop(compiler,
4002 trace,
4003 &alt_gens,
4004 &preload,
4005 &greedy_loop_state,
4006 text_length);
4007 } else {
4008 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4009 // match the traces produced pre-cleanup.
4010 Label second_choice;
4011 compiler->macro_assembler()->Bind(&second_choice);
4012
4013 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4014
4015 EmitChoices(compiler,
4016 &alt_gens,
4017 0,
4018 trace,
4019 &preload);
4020 }
4021
4022 // At this point we need to generate slow checks for the alternatives where
4023 // the quick check was inlined. We can recognize these because the associated
4024 // label was bound.
4025 int new_flush_budget = trace->flush_budget() / choice_count;
4026 for (int i = 0; i < choice_count; i++) {
4027 AlternativeGeneration* alt_gen = alt_gens.at(i);
4028 Trace new_trace(*trace);
4029 // If there are actions to be flushed we have to limit how many times
4030 // they are flushed. Take the budget of the parent trace and distribute
4031 // it fairly amongst the children.
4032 if (new_trace.actions() != NULL) {
4033 new_trace.set_flush_budget(new_flush_budget);
4034 }
4035 bool next_expects_preload =
4036 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4037 EmitOutOfLineContinuation(compiler,
4038 &new_trace,
4039 alternatives_->at(i),
4040 alt_gen,
4041 preload.preload_characters_,
4042 next_expects_preload);
4043 }
4044 }
4045
4046
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)4047 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4048 Trace* trace,
4049 AlternativeGenerationList* alt_gens,
4050 PreloadState* preload,
4051 GreedyLoopState* greedy_loop_state,
4052 int text_length) {
4053 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4054 // Here we have special handling for greedy loops containing only text nodes
4055 // and other simple nodes. These are handled by pushing the current
4056 // position on the stack and then incrementing the current position each
4057 // time around the switch. On backtrack we decrement the current position
4058 // and check it against the pushed value. This avoids pushing backtrack
4059 // information for each iteration of the loop, which could take up a lot of
4060 // space.
4061 DCHECK(trace->stop_node() == NULL);
4062 macro_assembler->PushCurrentPosition();
4063 Label greedy_match_failed;
4064 Trace greedy_match_trace;
4065 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4066 greedy_match_trace.set_backtrack(&greedy_match_failed);
4067 Label loop_label;
4068 macro_assembler->Bind(&loop_label);
4069 greedy_match_trace.set_stop_node(this);
4070 greedy_match_trace.set_loop_label(&loop_label);
4071 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4072 macro_assembler->Bind(&greedy_match_failed);
4073
4074 Label second_choice; // For use in greedy matches.
4075 macro_assembler->Bind(&second_choice);
4076
4077 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4078
4079 EmitChoices(compiler,
4080 alt_gens,
4081 1,
4082 new_trace,
4083 preload);
4084
4085 macro_assembler->Bind(greedy_loop_state->label());
4086 // If we have unwound to the bottom then backtrack.
4087 macro_assembler->CheckGreedyLoop(trace->backtrack());
4088 // Otherwise try the second priority at an earlier position.
4089 macro_assembler->AdvanceCurrentPosition(-text_length);
4090 macro_assembler->GoTo(&second_choice);
4091 return new_trace;
4092 }
4093
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4094 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4095 Trace* trace) {
4096 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4097 if (alternatives_->length() != 2) return eats_at_least;
4098
4099 GuardedAlternative alt1 = alternatives_->at(1);
4100 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4101 return eats_at_least;
4102 }
4103 RegExpNode* eats_anything_node = alt1.node();
4104 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4105 return eats_at_least;
4106 }
4107
4108 // Really we should be creating a new trace when we execute this function,
4109 // but there is no need, because the code it generates cannot backtrack, and
4110 // we always arrive here with a trivial trace (since it's the entry to a
4111 // loop. That also implies that there are no preloaded characters, which is
4112 // good, because it means we won't be violating any assumptions by
4113 // overwriting those characters with new load instructions.
4114 DCHECK(trace->is_trivial());
4115
4116 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4117 Isolate* isolate = macro_assembler->isolate();
4118 // At this point we know that we are at a non-greedy loop that will eat
4119 // any character one at a time. Any non-anchored regexp has such a
4120 // loop prepended to it in order to find where it starts. We look for
4121 // a pattern of the form ...abc... where we can look 6 characters ahead
4122 // and step forwards 3 if the character is not one of abc. Abc need
4123 // not be atoms, they can be any reasonably limited character class or
4124 // small alternation.
4125 BoyerMooreLookahead* bm = bm_info(false);
4126 if (bm == NULL) {
4127 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4128 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4129 kRecursionBudget,
4130 false));
4131 if (eats_at_least >= 1) {
4132 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4133 compiler,
4134 zone());
4135 GuardedAlternative alt0 = alternatives_->at(0);
4136 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4137 }
4138 }
4139 if (bm != NULL) {
4140 bm->EmitSkipInstructions(macro_assembler);
4141 }
4142 return eats_at_least;
4143 }
4144
4145
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4146 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4147 AlternativeGenerationList* alt_gens,
4148 int first_choice,
4149 Trace* trace,
4150 PreloadState* preload) {
4151 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4152 SetUpPreLoad(compiler, trace, preload);
4153
4154 // For now we just call all choices one after the other. The idea ultimately
4155 // is to use the Dispatch table to try only the relevant ones.
4156 int choice_count = alternatives_->length();
4157
4158 int new_flush_budget = trace->flush_budget() / choice_count;
4159
4160 for (int i = first_choice; i < choice_count; i++) {
4161 bool is_last = i == choice_count - 1;
4162 bool fall_through_on_failure = !is_last;
4163 GuardedAlternative alternative = alternatives_->at(i);
4164 AlternativeGeneration* alt_gen = alt_gens->at(i);
4165 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4166 ZoneList<Guard*>* guards = alternative.guards();
4167 int guard_count = (guards == NULL) ? 0 : guards->length();
4168 Trace new_trace(*trace);
4169 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4170 preload->preload_characters_ :
4171 0);
4172 if (preload->preload_has_checked_bounds_) {
4173 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4174 }
4175 new_trace.quick_check_performed()->Clear();
4176 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4177 if (!is_last) {
4178 new_trace.set_backtrack(&alt_gen->after);
4179 }
4180 alt_gen->expects_preload = preload->preload_is_current_;
4181 bool generate_full_check_inline = false;
4182 if (compiler->optimize() &&
4183 try_to_emit_quick_check_for_alternative(i == 0) &&
4184 alternative.node()->EmitQuickCheck(
4185 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4186 &alt_gen->possible_success, &alt_gen->quick_check_details,
4187 fall_through_on_failure)) {
4188 // Quick check was generated for this choice.
4189 preload->preload_is_current_ = true;
4190 preload->preload_has_checked_bounds_ = true;
4191 // If we generated the quick check to fall through on possible success,
4192 // we now need to generate the full check inline.
4193 if (!fall_through_on_failure) {
4194 macro_assembler->Bind(&alt_gen->possible_success);
4195 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4196 new_trace.set_characters_preloaded(preload->preload_characters_);
4197 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4198 generate_full_check_inline = true;
4199 }
4200 } else if (alt_gen->quick_check_details.cannot_match()) {
4201 if (!fall_through_on_failure) {
4202 macro_assembler->GoTo(trace->backtrack());
4203 }
4204 continue;
4205 } else {
4206 // No quick check was generated. Put the full code here.
4207 // If this is not the first choice then there could be slow checks from
4208 // previous cases that go here when they fail. There's no reason to
4209 // insist that they preload characters since the slow check we are about
4210 // to generate probably can't use it.
4211 if (i != first_choice) {
4212 alt_gen->expects_preload = false;
4213 new_trace.InvalidateCurrentCharacter();
4214 }
4215 generate_full_check_inline = true;
4216 }
4217 if (generate_full_check_inline) {
4218 if (new_trace.actions() != NULL) {
4219 new_trace.set_flush_budget(new_flush_budget);
4220 }
4221 for (int j = 0; j < guard_count; j++) {
4222 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4223 }
4224 alternative.node()->Emit(compiler, &new_trace);
4225 preload->preload_is_current_ = false;
4226 }
4227 macro_assembler->Bind(&alt_gen->after);
4228 }
4229 }
4230
4231
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4232 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4233 Trace* trace,
4234 GuardedAlternative alternative,
4235 AlternativeGeneration* alt_gen,
4236 int preload_characters,
4237 bool next_expects_preload) {
4238 if (!alt_gen->possible_success.is_linked()) return;
4239
4240 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4241 macro_assembler->Bind(&alt_gen->possible_success);
4242 Trace out_of_line_trace(*trace);
4243 out_of_line_trace.set_characters_preloaded(preload_characters);
4244 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4245 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4246 ZoneList<Guard*>* guards = alternative.guards();
4247 int guard_count = (guards == NULL) ? 0 : guards->length();
4248 if (next_expects_preload) {
4249 Label reload_current_char;
4250 out_of_line_trace.set_backtrack(&reload_current_char);
4251 for (int j = 0; j < guard_count; j++) {
4252 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4253 }
4254 alternative.node()->Emit(compiler, &out_of_line_trace);
4255 macro_assembler->Bind(&reload_current_char);
4256 // Reload the current character, since the next quick check expects that.
4257 // We don't need to check bounds here because we only get into this
4258 // code through a quick check which already did the checked load.
4259 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4260 NULL,
4261 false,
4262 preload_characters);
4263 macro_assembler->GoTo(&(alt_gen->after));
4264 } else {
4265 out_of_line_trace.set_backtrack(&(alt_gen->after));
4266 for (int j = 0; j < guard_count; j++) {
4267 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4268 }
4269 alternative.node()->Emit(compiler, &out_of_line_trace);
4270 }
4271 }
4272
4273
Emit(RegExpCompiler * compiler,Trace * trace)4274 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4275 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4276 LimitResult limit_result = LimitVersions(compiler, trace);
4277 if (limit_result == DONE) return;
4278 DCHECK(limit_result == CONTINUE);
4279
4280 RecursionCheck rc(compiler);
4281
4282 switch (action_type_) {
4283 case STORE_POSITION: {
4284 Trace::DeferredCapture
4285 new_capture(data_.u_position_register.reg,
4286 data_.u_position_register.is_capture,
4287 trace);
4288 Trace new_trace = *trace;
4289 new_trace.add_action(&new_capture);
4290 on_success()->Emit(compiler, &new_trace);
4291 break;
4292 }
4293 case INCREMENT_REGISTER: {
4294 Trace::DeferredIncrementRegister
4295 new_increment(data_.u_increment_register.reg);
4296 Trace new_trace = *trace;
4297 new_trace.add_action(&new_increment);
4298 on_success()->Emit(compiler, &new_trace);
4299 break;
4300 }
4301 case SET_REGISTER: {
4302 Trace::DeferredSetRegister
4303 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4304 Trace new_trace = *trace;
4305 new_trace.add_action(&new_set);
4306 on_success()->Emit(compiler, &new_trace);
4307 break;
4308 }
4309 case CLEAR_CAPTURES: {
4310 Trace::DeferredClearCaptures
4311 new_capture(Interval(data_.u_clear_captures.range_from,
4312 data_.u_clear_captures.range_to));
4313 Trace new_trace = *trace;
4314 new_trace.add_action(&new_capture);
4315 on_success()->Emit(compiler, &new_trace);
4316 break;
4317 }
4318 case BEGIN_SUBMATCH:
4319 if (!trace->is_trivial()) {
4320 trace->Flush(compiler, this);
4321 } else {
4322 assembler->WriteCurrentPositionToRegister(
4323 data_.u_submatch.current_position_register, 0);
4324 assembler->WriteStackPointerToRegister(
4325 data_.u_submatch.stack_pointer_register);
4326 on_success()->Emit(compiler, trace);
4327 }
4328 break;
4329 case EMPTY_MATCH_CHECK: {
4330 int start_pos_reg = data_.u_empty_match_check.start_register;
4331 int stored_pos = 0;
4332 int rep_reg = data_.u_empty_match_check.repetition_register;
4333 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4334 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4335 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4336 // If we know we haven't advanced and there is no minimum we
4337 // can just backtrack immediately.
4338 assembler->GoTo(trace->backtrack());
4339 } else if (know_dist && stored_pos < trace->cp_offset()) {
4340 // If we know we've advanced we can generate the continuation
4341 // immediately.
4342 on_success()->Emit(compiler, trace);
4343 } else if (!trace->is_trivial()) {
4344 trace->Flush(compiler, this);
4345 } else {
4346 Label skip_empty_check;
4347 // If we have a minimum number of repetitions we check the current
4348 // number first and skip the empty check if it's not enough.
4349 if (has_minimum) {
4350 int limit = data_.u_empty_match_check.repetition_limit;
4351 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4352 }
4353 // If the match is empty we bail out, otherwise we fall through
4354 // to the on-success continuation.
4355 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4356 trace->backtrack());
4357 assembler->Bind(&skip_empty_check);
4358 on_success()->Emit(compiler, trace);
4359 }
4360 break;
4361 }
4362 case POSITIVE_SUBMATCH_SUCCESS: {
4363 if (!trace->is_trivial()) {
4364 trace->Flush(compiler, this);
4365 return;
4366 }
4367 assembler->ReadCurrentPositionFromRegister(
4368 data_.u_submatch.current_position_register);
4369 assembler->ReadStackPointerFromRegister(
4370 data_.u_submatch.stack_pointer_register);
4371 int clear_register_count = data_.u_submatch.clear_register_count;
4372 if (clear_register_count == 0) {
4373 on_success()->Emit(compiler, trace);
4374 return;
4375 }
4376 int clear_registers_from = data_.u_submatch.clear_register_from;
4377 Label clear_registers_backtrack;
4378 Trace new_trace = *trace;
4379 new_trace.set_backtrack(&clear_registers_backtrack);
4380 on_success()->Emit(compiler, &new_trace);
4381
4382 assembler->Bind(&clear_registers_backtrack);
4383 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4384 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4385
4386 DCHECK(trace->backtrack() == NULL);
4387 assembler->Backtrack();
4388 return;
4389 }
4390 default:
4391 UNREACHABLE();
4392 }
4393 }
4394
4395
Emit(RegExpCompiler * compiler,Trace * trace)4396 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4397 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4398 if (!trace->is_trivial()) {
4399 trace->Flush(compiler, this);
4400 return;
4401 }
4402
4403 LimitResult limit_result = LimitVersions(compiler, trace);
4404 if (limit_result == DONE) return;
4405 DCHECK(limit_result == CONTINUE);
4406
4407 RecursionCheck rc(compiler);
4408
4409 DCHECK_EQ(start_reg_ + 1, end_reg_);
4410 if (compiler->ignore_case()) {
4411 assembler->CheckNotBackReferenceIgnoreCase(
4412 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4413 } else {
4414 assembler->CheckNotBackReference(start_reg_, read_backward(),
4415 trace->backtrack());
4416 }
4417 // We are going to advance backward, so we may end up at the start.
4418 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4419
4420 // Check that the back reference does not end inside a surrogate pair.
4421 if (compiler->unicode() && !compiler->one_byte()) {
4422 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4423 }
4424 on_success()->Emit(compiler, trace);
4425 }
4426
4427
4428 // -------------------------------------------------------------------
4429 // Dot/dotty output
4430
4431
4432 #ifdef DEBUG
4433
4434
4435 class DotPrinter: public NodeVisitor {
4436 public:
DotPrinter(std::ostream & os,bool ignore_case)4437 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4438 : os_(os),
4439 ignore_case_(ignore_case) {}
4440 void PrintNode(const char* label, RegExpNode* node);
4441 void Visit(RegExpNode* node);
4442 void PrintAttributes(RegExpNode* from);
4443 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4444 #define DECLARE_VISIT(Type) \
4445 virtual void Visit##Type(Type##Node* that);
4446 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4447 #undef DECLARE_VISIT
4448 private:
4449 std::ostream& os_;
4450 bool ignore_case_;
4451 };
4452
4453
PrintNode(const char * label,RegExpNode * node)4454 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4455 os_ << "digraph G {\n graph [label=\"";
4456 for (int i = 0; label[i]; i++) {
4457 switch (label[i]) {
4458 case '\\':
4459 os_ << "\\\\";
4460 break;
4461 case '"':
4462 os_ << "\"";
4463 break;
4464 default:
4465 os_ << label[i];
4466 break;
4467 }
4468 }
4469 os_ << "\"];\n";
4470 Visit(node);
4471 os_ << "}" << std::endl;
4472 }
4473
4474
Visit(RegExpNode * node)4475 void DotPrinter::Visit(RegExpNode* node) {
4476 if (node->info()->visited) return;
4477 node->info()->visited = true;
4478 node->Accept(this);
4479 }
4480
4481
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4482 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4483 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4484 Visit(on_failure);
4485 }
4486
4487
4488 class TableEntryBodyPrinter {
4489 public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4490 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4491 : os_(os),
4492 choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4493 void Call(uc16 from, DispatchTable::Entry entry) {
4494 OutSet* out_set = entry.out_set();
4495 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4496 if (out_set->Get(i)) {
4497 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4498 << choice()->alternatives()->at(i).node() << ";\n";
4499 }
4500 }
4501 }
4502 private:
choice()4503 ChoiceNode* choice() { return choice_; }
4504 std::ostream& os_;
4505 ChoiceNode* choice_;
4506 };
4507
4508
4509 class TableEntryHeaderPrinter {
4510 public:
TableEntryHeaderPrinter(std::ostream & os)4511 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4512 : first_(true),
4513 os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4514 void Call(uc16 from, DispatchTable::Entry entry) {
4515 if (first_) {
4516 first_ = false;
4517 } else {
4518 os_ << "|";
4519 }
4520 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4521 OutSet* out_set = entry.out_set();
4522 int priority = 0;
4523 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4524 if (out_set->Get(i)) {
4525 if (priority > 0) os_ << "|";
4526 os_ << "<s" << from << "o" << i << "> " << priority;
4527 priority++;
4528 }
4529 }
4530 os_ << "}}";
4531 }
4532
4533 private:
4534 bool first_;
4535 std::ostream& os_;
4536 };
4537
4538
4539 class AttributePrinter {
4540 public:
AttributePrinter(std::ostream & os)4541 explicit AttributePrinter(std::ostream& os) // NOLINT
4542 : os_(os),
4543 first_(true) {}
PrintSeparator()4544 void PrintSeparator() {
4545 if (first_) {
4546 first_ = false;
4547 } else {
4548 os_ << "|";
4549 }
4550 }
PrintBit(const char * name,bool value)4551 void PrintBit(const char* name, bool value) {
4552 if (!value) return;
4553 PrintSeparator();
4554 os_ << "{" << name << "}";
4555 }
PrintPositive(const char * name,int value)4556 void PrintPositive(const char* name, int value) {
4557 if (value < 0) return;
4558 PrintSeparator();
4559 os_ << "{" << name << "|" << value << "}";
4560 }
4561
4562 private:
4563 std::ostream& os_;
4564 bool first_;
4565 };
4566
4567
PrintAttributes(RegExpNode * that)4568 void DotPrinter::PrintAttributes(RegExpNode* that) {
4569 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4570 << "margin=0.1, fontsize=10, label=\"{";
4571 AttributePrinter printer(os_);
4572 NodeInfo* info = that->info();
4573 printer.PrintBit("NI", info->follows_newline_interest);
4574 printer.PrintBit("WI", info->follows_word_interest);
4575 printer.PrintBit("SI", info->follows_start_interest);
4576 Label* label = that->label();
4577 if (label->is_bound())
4578 printer.PrintPositive("@", label->pos());
4579 os_ << "}\"];\n"
4580 << " a" << that << " -> n" << that
4581 << " [style=dashed, color=grey, arrowhead=none];\n";
4582 }
4583
4584
4585 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4586 void DotPrinter::VisitChoice(ChoiceNode* that) {
4587 if (kPrintDispatchTable) {
4588 os_ << " n" << that << " [shape=Mrecord, label=\"";
4589 TableEntryHeaderPrinter header_printer(os_);
4590 that->GetTable(ignore_case_)->ForEach(&header_printer);
4591 os_ << "\"]\n";
4592 PrintAttributes(that);
4593 TableEntryBodyPrinter body_printer(os_, that);
4594 that->GetTable(ignore_case_)->ForEach(&body_printer);
4595 } else {
4596 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4597 for (int i = 0; i < that->alternatives()->length(); i++) {
4598 GuardedAlternative alt = that->alternatives()->at(i);
4599 os_ << " n" << that << " -> n" << alt.node();
4600 }
4601 }
4602 for (int i = 0; i < that->alternatives()->length(); i++) {
4603 GuardedAlternative alt = that->alternatives()->at(i);
4604 alt.node()->Accept(this);
4605 }
4606 }
4607
4608
VisitText(TextNode * that)4609 void DotPrinter::VisitText(TextNode* that) {
4610 Zone* zone = that->zone();
4611 os_ << " n" << that << " [label=\"";
4612 for (int i = 0; i < that->elements()->length(); i++) {
4613 if (i > 0) os_ << " ";
4614 TextElement elm = that->elements()->at(i);
4615 switch (elm.text_type()) {
4616 case TextElement::ATOM: {
4617 Vector<const uc16> data = elm.atom()->data();
4618 for (int i = 0; i < data.length(); i++) {
4619 os_ << static_cast<char>(data[i]);
4620 }
4621 break;
4622 }
4623 case TextElement::CHAR_CLASS: {
4624 RegExpCharacterClass* node = elm.char_class();
4625 os_ << "[";
4626 if (node->is_negated()) os_ << "^";
4627 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4628 CharacterRange range = node->ranges(zone)->at(j);
4629 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4630 }
4631 os_ << "]";
4632 break;
4633 }
4634 default:
4635 UNREACHABLE();
4636 }
4637 }
4638 os_ << "\", shape=box, peripheries=2];\n";
4639 PrintAttributes(that);
4640 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4641 Visit(that->on_success());
4642 }
4643
4644
VisitBackReference(BackReferenceNode * that)4645 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4646 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4647 << that->end_register() << "\", shape=doubleoctagon];\n";
4648 PrintAttributes(that);
4649 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4650 Visit(that->on_success());
4651 }
4652
4653
VisitEnd(EndNode * that)4654 void DotPrinter::VisitEnd(EndNode* that) {
4655 os_ << " n" << that << " [style=bold, shape=point];\n";
4656 PrintAttributes(that);
4657 }
4658
4659
VisitAssertion(AssertionNode * that)4660 void DotPrinter::VisitAssertion(AssertionNode* that) {
4661 os_ << " n" << that << " [";
4662 switch (that->assertion_type()) {
4663 case AssertionNode::AT_END:
4664 os_ << "label=\"$\", shape=septagon";
4665 break;
4666 case AssertionNode::AT_START:
4667 os_ << "label=\"^\", shape=septagon";
4668 break;
4669 case AssertionNode::AT_BOUNDARY:
4670 os_ << "label=\"\\b\", shape=septagon";
4671 break;
4672 case AssertionNode::AT_NON_BOUNDARY:
4673 os_ << "label=\"\\B\", shape=septagon";
4674 break;
4675 case AssertionNode::AFTER_NEWLINE:
4676 os_ << "label=\"(?<=\\n)\", shape=septagon";
4677 break;
4678 }
4679 os_ << "];\n";
4680 PrintAttributes(that);
4681 RegExpNode* successor = that->on_success();
4682 os_ << " n" << that << " -> n" << successor << ";\n";
4683 Visit(successor);
4684 }
4685
4686
VisitAction(ActionNode * that)4687 void DotPrinter::VisitAction(ActionNode* that) {
4688 os_ << " n" << that << " [";
4689 switch (that->action_type_) {
4690 case ActionNode::SET_REGISTER:
4691 os_ << "label=\"$" << that->data_.u_store_register.reg
4692 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4693 break;
4694 case ActionNode::INCREMENT_REGISTER:
4695 os_ << "label=\"$" << that->data_.u_increment_register.reg
4696 << "++\", shape=octagon";
4697 break;
4698 case ActionNode::STORE_POSITION:
4699 os_ << "label=\"$" << that->data_.u_position_register.reg
4700 << ":=$pos\", shape=octagon";
4701 break;
4702 case ActionNode::BEGIN_SUBMATCH:
4703 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4704 << ":=$pos,begin\", shape=septagon";
4705 break;
4706 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4707 os_ << "label=\"escape\", shape=septagon";
4708 break;
4709 case ActionNode::EMPTY_MATCH_CHECK:
4710 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4711 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4712 << "<" << that->data_.u_empty_match_check.repetition_limit
4713 << "?\", shape=septagon";
4714 break;
4715 case ActionNode::CLEAR_CAPTURES: {
4716 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4717 << " to $" << that->data_.u_clear_captures.range_to
4718 << "\", shape=septagon";
4719 break;
4720 }
4721 }
4722 os_ << "];\n";
4723 PrintAttributes(that);
4724 RegExpNode* successor = that->on_success();
4725 os_ << " n" << that << " -> n" << successor << ";\n";
4726 Visit(successor);
4727 }
4728
4729
4730 class DispatchTableDumper {
4731 public:
DispatchTableDumper(std::ostream & os)4732 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4733 void Call(uc16 key, DispatchTable::Entry entry);
4734 private:
4735 std::ostream& os_;
4736 };
4737
4738
Call(uc16 key,DispatchTable::Entry entry)4739 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4740 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4741 OutSet* set = entry.out_set();
4742 bool first = true;
4743 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4744 if (set->Get(i)) {
4745 if (first) {
4746 first = false;
4747 } else {
4748 os_ << ", ";
4749 }
4750 os_ << i;
4751 }
4752 }
4753 os_ << "}\n";
4754 }
4755
4756
Dump()4757 void DispatchTable::Dump() {
4758 OFStream os(stderr);
4759 DispatchTableDumper dumper(os);
4760 tree()->ForEach(&dumper);
4761 }
4762
4763
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4764 void RegExpEngine::DotPrint(const char* label,
4765 RegExpNode* node,
4766 bool ignore_case) {
4767 OFStream os(stdout);
4768 DotPrinter printer(os, ignore_case);
4769 printer.PrintNode(label, node);
4770 }
4771
4772
4773 #endif // DEBUG
4774
4775
4776 // -------------------------------------------------------------------
4777 // Tree to graph conversion
4778
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4779 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4780 RegExpNode* on_success) {
4781 ZoneList<TextElement>* elms =
4782 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4783 elms->Add(TextElement::Atom(this), compiler->zone());
4784 return new (compiler->zone())
4785 TextNode(elms, compiler->read_backward(), on_success);
4786 }
4787
4788
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4789 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4790 RegExpNode* on_success) {
4791 return new (compiler->zone())
4792 TextNode(elements(), compiler->read_backward(), on_success);
4793 }
4794
4795
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4796 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4797 const int* special_class,
4798 int length) {
4799 length--; // Remove final marker.
4800 DCHECK(special_class[length] == kRangeEndMarker);
4801 DCHECK(ranges->length() != 0);
4802 DCHECK(length != 0);
4803 DCHECK(special_class[0] != 0);
4804 if (ranges->length() != (length >> 1) + 1) {
4805 return false;
4806 }
4807 CharacterRange range = ranges->at(0);
4808 if (range.from() != 0) {
4809 return false;
4810 }
4811 for (int i = 0; i < length; i += 2) {
4812 if (special_class[i] != (range.to() + 1)) {
4813 return false;
4814 }
4815 range = ranges->at((i >> 1) + 1);
4816 if (special_class[i+1] != range.from()) {
4817 return false;
4818 }
4819 }
4820 if (range.to() != String::kMaxCodePoint) {
4821 return false;
4822 }
4823 return true;
4824 }
4825
4826
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4827 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4828 const int* special_class,
4829 int length) {
4830 length--; // Remove final marker.
4831 DCHECK(special_class[length] == kRangeEndMarker);
4832 if (ranges->length() * 2 != length) {
4833 return false;
4834 }
4835 for (int i = 0; i < length; i += 2) {
4836 CharacterRange range = ranges->at(i >> 1);
4837 if (range.from() != special_class[i] ||
4838 range.to() != special_class[i + 1] - 1) {
4839 return false;
4840 }
4841 }
4842 return true;
4843 }
4844
4845
is_standard(Zone * zone)4846 bool RegExpCharacterClass::is_standard(Zone* zone) {
4847 // TODO(lrn): Remove need for this function, by not throwing away information
4848 // along the way.
4849 if (is_negated_) {
4850 return false;
4851 }
4852 if (set_.is_standard()) {
4853 return true;
4854 }
4855 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4856 set_.set_standard_set_type('s');
4857 return true;
4858 }
4859 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4860 set_.set_standard_set_type('S');
4861 return true;
4862 }
4863 if (CompareInverseRanges(set_.ranges(zone),
4864 kLineTerminatorRanges,
4865 kLineTerminatorRangeCount)) {
4866 set_.set_standard_set_type('.');
4867 return true;
4868 }
4869 if (CompareRanges(set_.ranges(zone),
4870 kLineTerminatorRanges,
4871 kLineTerminatorRangeCount)) {
4872 set_.set_standard_set_type('n');
4873 return true;
4874 }
4875 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4876 set_.set_standard_set_type('w');
4877 return true;
4878 }
4879 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4880 set_.set_standard_set_type('W');
4881 return true;
4882 }
4883 return false;
4884 }
4885
4886
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4887 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4888 ZoneList<CharacterRange>* base)
4889 : zone_(zone),
4890 table_(zone),
4891 bmp_(nullptr),
4892 lead_surrogates_(nullptr),
4893 trail_surrogates_(nullptr),
4894 non_bmp_(nullptr) {
4895 // The unicode range splitter categorizes given character ranges into:
4896 // - Code points from the BMP representable by one code unit.
4897 // - Code points outside the BMP that need to be split into surrogate pairs.
4898 // - Lone lead surrogates.
4899 // - Lone trail surrogates.
4900 // Lone surrogates are valid code points, even though no actual characters.
4901 // They require special matching to make sure we do not split surrogate pairs.
4902 // We use the dispatch table to accomplish this. The base range is split up
4903 // by the table by the overlay ranges, and the Call callback is used to
4904 // filter and collect ranges for each category.
4905 for (int i = 0; i < base->length(); i++) {
4906 table_.AddRange(base->at(i), kBase, zone_);
4907 }
4908 // Add overlay ranges.
4909 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4910 kBmpCodePoints, zone_);
4911 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4912 kLeadSurrogates, zone_);
4913 table_.AddRange(
4914 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4915 kTrailSurrogates, zone_);
4916 table_.AddRange(
4917 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4918 kBmpCodePoints, zone_);
4919 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4920 kNonBmpCodePoints, zone_);
4921 table_.ForEach(this);
4922 }
4923
4924
Call(uc32 from,DispatchTable::Entry entry)4925 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4926 OutSet* outset = entry.out_set();
4927 if (!outset->Get(kBase)) return;
4928 ZoneList<CharacterRange>** target = NULL;
4929 if (outset->Get(kBmpCodePoints)) {
4930 target = &bmp_;
4931 } else if (outset->Get(kLeadSurrogates)) {
4932 target = &lead_surrogates_;
4933 } else if (outset->Get(kTrailSurrogates)) {
4934 target = &trail_surrogates_;
4935 } else {
4936 DCHECK(outset->Get(kNonBmpCodePoints));
4937 target = &non_bmp_;
4938 }
4939 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4940 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4941 }
4942
4943
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4944 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4945 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4946 ZoneList<CharacterRange>* bmp = splitter->bmp();
4947 if (bmp == nullptr) return;
4948 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4949 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4950 }
4951
4952
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4953 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4954 RegExpNode* on_success,
4955 UnicodeRangeSplitter* splitter) {
4956 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4957 if (non_bmp == nullptr) return;
4958 DCHECK(compiler->unicode());
4959 DCHECK(!compiler->one_byte());
4960 Zone* zone = compiler->zone();
4961 CharacterRange::Canonicalize(non_bmp);
4962 for (int i = 0; i < non_bmp->length(); i++) {
4963 // Match surrogate pair.
4964 // E.g. [\u10005-\u11005] becomes
4965 // \ud800[\udc05-\udfff]|
4966 // [\ud801-\ud803][\udc00-\udfff]|
4967 // \ud804[\udc00-\udc05]
4968 uc32 from = non_bmp->at(i).from();
4969 uc32 to = non_bmp->at(i).to();
4970 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4971 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4972 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4973 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4974 if (from_l == to_l) {
4975 // The lead surrogate is the same.
4976 result->AddAlternative(
4977 GuardedAlternative(TextNode::CreateForSurrogatePair(
4978 zone, CharacterRange::Singleton(from_l),
4979 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4980 on_success)));
4981 } else {
4982 if (from_t != kTrailSurrogateStart) {
4983 // Add [from_l][from_t-\udfff]
4984 result->AddAlternative(
4985 GuardedAlternative(TextNode::CreateForSurrogatePair(
4986 zone, CharacterRange::Singleton(from_l),
4987 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4988 compiler->read_backward(), on_success)));
4989 from_l++;
4990 }
4991 if (to_t != kTrailSurrogateEnd) {
4992 // Add [to_l][\udc00-to_t]
4993 result->AddAlternative(
4994 GuardedAlternative(TextNode::CreateForSurrogatePair(
4995 zone, CharacterRange::Singleton(to_l),
4996 CharacterRange::Range(kTrailSurrogateStart, to_t),
4997 compiler->read_backward(), on_success)));
4998 to_l--;
4999 }
5000 if (from_l <= to_l) {
5001 // Add [from_l-to_l][\udc00-\udfff]
5002 result->AddAlternative(
5003 GuardedAlternative(TextNode::CreateForSurrogatePair(
5004 zone, CharacterRange::Range(from_l, to_l),
5005 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5006 compiler->read_backward(), on_success)));
5007 }
5008 }
5009 }
5010 }
5011
5012
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward)5013 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5014 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5015 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5016 bool read_backward) {
5017 Zone* zone = compiler->zone();
5018 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5019 zone, match, read_backward, on_success);
5020 int stack_register = compiler->UnicodeLookaroundStackRegister();
5021 int position_register = compiler->UnicodeLookaroundPositionRegister();
5022 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5023 position_register);
5024 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5025 zone, lookbehind, !read_backward, lookaround.on_match_success());
5026 return lookaround.ForMatch(negative_match);
5027 }
5028
5029
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward)5030 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5031 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5032 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5033 bool read_backward) {
5034 Zone* zone = compiler->zone();
5035 int stack_register = compiler->UnicodeLookaroundStackRegister();
5036 int position_register = compiler->UnicodeLookaroundPositionRegister();
5037 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5038 position_register);
5039 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5040 zone, lookahead, read_backward, lookaround.on_match_success());
5041 return TextNode::CreateForCharacterRanges(
5042 zone, match, read_backward, lookaround.ForMatch(negative_match));
5043 }
5044
5045
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5046 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5047 RegExpNode* on_success,
5048 UnicodeRangeSplitter* splitter) {
5049 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5050 if (lead_surrogates == nullptr) return;
5051 Zone* zone = compiler->zone();
5052 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5053 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5054 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5055
5056 RegExpNode* match;
5057 if (compiler->read_backward()) {
5058 // Reading backward. Assert that reading forward, there is no trail
5059 // surrogate, and then backward match the lead surrogate.
5060 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5061 compiler, trail_surrogates, lead_surrogates, on_success, true);
5062 } else {
5063 // Reading forward. Forward match the lead surrogate and assert that
5064 // no trail surrogate follows.
5065 match = MatchAndNegativeLookaroundInReadDirection(
5066 compiler, lead_surrogates, trail_surrogates, on_success, false);
5067 }
5068 result->AddAlternative(GuardedAlternative(match));
5069 }
5070
5071
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5072 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5073 RegExpNode* on_success,
5074 UnicodeRangeSplitter* splitter) {
5075 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5076 if (trail_surrogates == nullptr) return;
5077 Zone* zone = compiler->zone();
5078 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5079 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5080 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5081
5082 RegExpNode* match;
5083 if (compiler->read_backward()) {
5084 // Reading backward. Backward match the trail surrogate and assert that no
5085 // lead surrogate precedes it.
5086 match = MatchAndNegativeLookaroundInReadDirection(
5087 compiler, trail_surrogates, lead_surrogates, on_success, true);
5088 } else {
5089 // Reading forward. Assert that reading backward, there is no lead
5090 // surrogate, and then forward match the trail surrogate.
5091 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5092 compiler, lead_surrogates, trail_surrogates, on_success, false);
5093 }
5094 result->AddAlternative(GuardedAlternative(match));
5095 }
5096
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5097 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5098 RegExpNode* on_success) {
5099 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5100 DCHECK(!compiler->read_backward());
5101 Zone* zone = compiler->zone();
5102 // Advance any character. If the character happens to be a lead surrogate and
5103 // we advanced into the middle of a surrogate pair, it will work out, as
5104 // nothing will match from there. We will have to advance again, consuming
5105 // the associated trail surrogate.
5106 ZoneList<CharacterRange>* range = CharacterRange::List(
5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5109 }
5110
5111
AddUnicodeCaseEquivalents(RegExpCompiler * compiler,ZoneList<CharacterRange> * ranges)5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5113 ZoneList<CharacterRange>* ranges) {
5114 #ifdef V8_I18N_SUPPORT
5115 // Use ICU to compute the case fold closure over the ranges.
5116 DCHECK(compiler->unicode());
5117 DCHECK(compiler->ignore_case());
5118 icu::UnicodeSet set;
5119 for (int i = 0; i < ranges->length(); i++) {
5120 set.add(ranges->at(i).from(), ranges->at(i).to());
5121 }
5122 ranges->Clear();
5123 set.closeOver(USET_CASE_INSENSITIVE);
5124 // Full case mapping map single characters to multiple characters.
5125 // Those are represented as strings in the set. Remove them so that
5126 // we end up with only simple and common case mappings.
5127 set.removeAllStrings();
5128 Zone* zone = compiler->zone();
5129 for (int i = 0; i < set.getRangeCount(); i++) {
5130 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5131 zone);
5132 }
5133 // No errors and everything we collected have been ranges.
5134 #else
5135 // Fallback if ICU is not included.
5136 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5137 ranges, compiler->one_byte());
5138 #endif // V8_I18N_SUPPORT
5139 CharacterRange::Canonicalize(ranges);
5140 }
5141
5142
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5144 RegExpNode* on_success) {
5145 set_.Canonicalize();
5146 Zone* zone = compiler->zone();
5147 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5148 if (compiler->unicode() && compiler->ignore_case()) {
5149 AddUnicodeCaseEquivalents(compiler, ranges);
5150 }
5151 if (compiler->unicode() && !compiler->one_byte()) {
5152 if (is_negated()) {
5153 ZoneList<CharacterRange>* negated =
5154 new (zone) ZoneList<CharacterRange>(2, zone);
5155 CharacterRange::Negate(ranges, negated, zone);
5156 ranges = negated;
5157 }
5158 if (ranges->length() == 0) {
5159 ranges->Add(CharacterRange::Everything(), zone);
5160 RegExpCharacterClass* fail =
5161 new (zone) RegExpCharacterClass(ranges, true);
5162 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5163 }
5164 if (standard_type() == '*') {
5165 return UnanchoredAdvance(compiler, on_success);
5166 } else {
5167 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5168 UnicodeRangeSplitter splitter(zone, ranges);
5169 AddBmpCharacters(compiler, result, on_success, &splitter);
5170 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5171 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5172 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5173 return result;
5174 }
5175 } else {
5176 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5177 }
5178 }
5179
5180
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5181 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5182 RegExpAtom* atom1 = (*a)->AsAtom();
5183 RegExpAtom* atom2 = (*b)->AsAtom();
5184 uc16 character1 = atom1->data().at(0);
5185 uc16 character2 = atom2->data().at(0);
5186 if (character1 < character2) return -1;
5187 if (character1 > character2) return 1;
5188 return 0;
5189 }
5190
5191
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5192 static unibrow::uchar Canonical(
5193 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5194 unibrow::uchar c) {
5195 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5196 int length = canonicalize->get(c, '\0', chars);
5197 DCHECK_LE(length, 1);
5198 unibrow::uchar canonical = c;
5199 if (length == 1) canonical = chars[0];
5200 return canonical;
5201 }
5202
5203
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5204 int CompareFirstCharCaseIndependent(
5205 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5206 RegExpTree* const* a, RegExpTree* const* b) {
5207 RegExpAtom* atom1 = (*a)->AsAtom();
5208 RegExpAtom* atom2 = (*b)->AsAtom();
5209 unibrow::uchar character1 = atom1->data().at(0);
5210 unibrow::uchar character2 = atom2->data().at(0);
5211 if (character1 == character2) return 0;
5212 if (character1 >= 'a' || character2 >= 'a') {
5213 character1 = Canonical(canonicalize, character1);
5214 character2 = Canonical(canonicalize, character2);
5215 }
5216 return static_cast<int>(character1) - static_cast<int>(character2);
5217 }
5218
5219
5220 // We can stable sort runs of atoms, since the order does not matter if they
5221 // start with different characters.
5222 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5223 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5224 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5225 int length = alternatives->length();
5226 bool found_consecutive_atoms = false;
5227 for (int i = 0; i < length; i++) {
5228 while (i < length) {
5229 RegExpTree* alternative = alternatives->at(i);
5230 if (alternative->IsAtom()) break;
5231 i++;
5232 }
5233 // i is length or it is the index of an atom.
5234 if (i == length) break;
5235 int first_atom = i;
5236 i++;
5237 while (i < length) {
5238 RegExpTree* alternative = alternatives->at(i);
5239 if (!alternative->IsAtom()) break;
5240 i++;
5241 }
5242 // Sort atoms to get ones with common prefixes together.
5243 // This step is more tricky if we are in a case-independent regexp,
5244 // because it would change /is|I/ to /I|is/, and order matters when
5245 // the regexp parts don't match only disjoint starting points. To fix
5246 // this we have a version of CompareFirstChar that uses case-
5247 // independent character classes for comparison.
5248 DCHECK_LT(first_atom, alternatives->length());
5249 DCHECK_LE(i, alternatives->length());
5250 DCHECK_LE(first_atom, i);
5251 if (compiler->ignore_case()) {
5252 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5253 compiler->isolate()->regexp_macro_assembler_canonicalize();
5254 auto compare_closure =
5255 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5256 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5257 };
5258 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5259 } else {
5260 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5261 }
5262 if (i - first_atom > 1) found_consecutive_atoms = true;
5263 }
5264 return found_consecutive_atoms;
5265 }
5266
5267
5268 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5269 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5270 Zone* zone = compiler->zone();
5271 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5272 int length = alternatives->length();
5273
5274 int write_posn = 0;
5275 int i = 0;
5276 while (i < length) {
5277 RegExpTree* alternative = alternatives->at(i);
5278 if (!alternative->IsAtom()) {
5279 alternatives->at(write_posn++) = alternatives->at(i);
5280 i++;
5281 continue;
5282 }
5283 RegExpAtom* atom = alternative->AsAtom();
5284 unibrow::uchar common_prefix = atom->data().at(0);
5285 int first_with_prefix = i;
5286 int prefix_length = atom->length();
5287 i++;
5288 while (i < length) {
5289 alternative = alternatives->at(i);
5290 if (!alternative->IsAtom()) break;
5291 atom = alternative->AsAtom();
5292 unibrow::uchar new_prefix = atom->data().at(0);
5293 if (new_prefix != common_prefix) {
5294 if (!compiler->ignore_case()) break;
5295 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5296 compiler->isolate()->regexp_macro_assembler_canonicalize();
5297 new_prefix = Canonical(canonicalize, new_prefix);
5298 common_prefix = Canonical(canonicalize, common_prefix);
5299 if (new_prefix != common_prefix) break;
5300 }
5301 prefix_length = Min(prefix_length, atom->length());
5302 i++;
5303 }
5304 if (i > first_with_prefix + 2) {
5305 // Found worthwhile run of alternatives with common prefix of at least one
5306 // character. The sorting function above did not sort on more than one
5307 // character for reasons of correctness, but there may still be a longer
5308 // common prefix if the terms were similar or presorted in the input.
5309 // Find out how long the common prefix is.
5310 int run_length = i - first_with_prefix;
5311 atom = alternatives->at(first_with_prefix)->AsAtom();
5312 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5313 RegExpAtom* old_atom =
5314 alternatives->at(j + first_with_prefix)->AsAtom();
5315 for (int k = 1; k < prefix_length; k++) {
5316 if (atom->data().at(k) != old_atom->data().at(k)) {
5317 prefix_length = k;
5318 break;
5319 }
5320 }
5321 }
5322 RegExpAtom* prefix =
5323 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5324 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5325 pair->Add(prefix, zone);
5326 ZoneList<RegExpTree*>* suffixes =
5327 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5328 for (int j = 0; j < run_length; j++) {
5329 RegExpAtom* old_atom =
5330 alternatives->at(j + first_with_prefix)->AsAtom();
5331 int len = old_atom->length();
5332 if (len == prefix_length) {
5333 suffixes->Add(new (zone) RegExpEmpty(), zone);
5334 } else {
5335 RegExpTree* suffix = new (zone) RegExpAtom(
5336 old_atom->data().SubVector(prefix_length, old_atom->length()));
5337 suffixes->Add(suffix, zone);
5338 }
5339 }
5340 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5341 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5342 } else {
5343 // Just copy any non-worthwhile alternatives.
5344 for (int j = first_with_prefix; j < i; j++) {
5345 alternatives->at(write_posn++) = alternatives->at(j);
5346 }
5347 }
5348 }
5349 alternatives->Rewind(write_posn); // Trim end of array.
5350 }
5351
5352
5353 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5354 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5355 RegExpCompiler* compiler) {
5356 Zone* zone = compiler->zone();
5357 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5358 int length = alternatives->length();
5359
5360 int write_posn = 0;
5361 int i = 0;
5362 while (i < length) {
5363 RegExpTree* alternative = alternatives->at(i);
5364 if (!alternative->IsAtom()) {
5365 alternatives->at(write_posn++) = alternatives->at(i);
5366 i++;
5367 continue;
5368 }
5369 RegExpAtom* atom = alternative->AsAtom();
5370 if (atom->length() != 1) {
5371 alternatives->at(write_posn++) = alternatives->at(i);
5372 i++;
5373 continue;
5374 }
5375 int first_in_run = i;
5376 i++;
5377 while (i < length) {
5378 alternative = alternatives->at(i);
5379 if (!alternative->IsAtom()) break;
5380 atom = alternative->AsAtom();
5381 if (atom->length() != 1) break;
5382 i++;
5383 }
5384 if (i > first_in_run + 1) {
5385 // Found non-trivial run of single-character alternatives.
5386 int run_length = i - first_in_run;
5387 ZoneList<CharacterRange>* ranges =
5388 new (zone) ZoneList<CharacterRange>(2, zone);
5389 for (int j = 0; j < run_length; j++) {
5390 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5391 DCHECK_EQ(old_atom->length(), 1);
5392 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5393 }
5394 alternatives->at(write_posn++) =
5395 new (zone) RegExpCharacterClass(ranges, false);
5396 } else {
5397 // Just copy any trivial alternatives.
5398 for (int j = first_in_run; j < i; j++) {
5399 alternatives->at(write_posn++) = alternatives->at(j);
5400 }
5401 }
5402 }
5403 alternatives->Rewind(write_posn); // Trim end of array.
5404 }
5405
5406
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5407 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5408 RegExpNode* on_success) {
5409 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5410
5411 if (alternatives->length() > 2) {
5412 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5413 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5414 FixSingleCharacterDisjunctions(compiler);
5415 if (alternatives->length() == 1) {
5416 return alternatives->at(0)->ToNode(compiler, on_success);
5417 }
5418 }
5419
5420 int length = alternatives->length();
5421
5422 ChoiceNode* result =
5423 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5424 for (int i = 0; i < length; i++) {
5425 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5426 on_success));
5427 result->AddAlternative(alternative);
5428 }
5429 return result;
5430 }
5431
5432
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5433 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5434 RegExpNode* on_success) {
5435 return ToNode(min(),
5436 max(),
5437 is_greedy(),
5438 body(),
5439 compiler,
5440 on_success);
5441 }
5442
5443
5444 // Scoped object to keep track of how much we unroll quantifier loops in the
5445 // regexp graph generator.
5446 class RegExpExpansionLimiter {
5447 public:
5448 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5449 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5450 : compiler_(compiler),
5451 saved_expansion_factor_(compiler->current_expansion_factor()),
5452 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5453 DCHECK(factor > 0);
5454 if (ok_to_expand_) {
5455 if (factor > kMaxExpansionFactor) {
5456 // Avoid integer overflow of the current expansion factor.
5457 ok_to_expand_ = false;
5458 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5459 } else {
5460 int new_factor = saved_expansion_factor_ * factor;
5461 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5462 compiler->set_current_expansion_factor(new_factor);
5463 }
5464 }
5465 }
5466
~RegExpExpansionLimiter()5467 ~RegExpExpansionLimiter() {
5468 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5469 }
5470
ok_to_expand()5471 bool ok_to_expand() { return ok_to_expand_; }
5472
5473 private:
5474 RegExpCompiler* compiler_;
5475 int saved_expansion_factor_;
5476 bool ok_to_expand_;
5477
5478 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5479 };
5480
5481
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5482 RegExpNode* RegExpQuantifier::ToNode(int min,
5483 int max,
5484 bool is_greedy,
5485 RegExpTree* body,
5486 RegExpCompiler* compiler,
5487 RegExpNode* on_success,
5488 bool not_at_start) {
5489 // x{f, t} becomes this:
5490 //
5491 // (r++)<-.
5492 // | `
5493 // | (x)
5494 // v ^
5495 // (r=0)-->(?)---/ [if r < t]
5496 // |
5497 // [if r >= f] \----> ...
5498 //
5499
5500 // 15.10.2.5 RepeatMatcher algorithm.
5501 // The parser has already eliminated the case where max is 0. In the case
5502 // where max_match is zero the parser has removed the quantifier if min was
5503 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5504
5505 // If we know that we cannot match zero length then things are a little
5506 // simpler since we don't need to make the special zero length match check
5507 // from step 2.1. If the min and max are small we can unroll a little in
5508 // this case.
5509 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5510 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5511 if (max == 0) return on_success; // This can happen due to recursion.
5512 bool body_can_be_empty = (body->min_match() == 0);
5513 int body_start_reg = RegExpCompiler::kNoRegister;
5514 Interval capture_registers = body->CaptureRegisters();
5515 bool needs_capture_clearing = !capture_registers.is_empty();
5516 Zone* zone = compiler->zone();
5517
5518 if (body_can_be_empty) {
5519 body_start_reg = compiler->AllocateRegister();
5520 } else if (compiler->optimize() && !needs_capture_clearing) {
5521 // Only unroll if there are no captures and the body can't be
5522 // empty.
5523 {
5524 RegExpExpansionLimiter limiter(
5525 compiler, min + ((max != min) ? 1 : 0));
5526 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5527 int new_max = (max == kInfinity) ? max : max - min;
5528 // Recurse once to get the loop or optional matches after the fixed
5529 // ones.
5530 RegExpNode* answer = ToNode(
5531 0, new_max, is_greedy, body, compiler, on_success, true);
5532 // Unroll the forced matches from 0 to min. This can cause chains of
5533 // TextNodes (which the parser does not generate). These should be
5534 // combined if it turns out they hinder good code generation.
5535 for (int i = 0; i < min; i++) {
5536 answer = body->ToNode(compiler, answer);
5537 }
5538 return answer;
5539 }
5540 }
5541 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5542 DCHECK(max > 0); // Due to the 'if' above.
5543 RegExpExpansionLimiter limiter(compiler, max);
5544 if (limiter.ok_to_expand()) {
5545 // Unroll the optional matches up to max.
5546 RegExpNode* answer = on_success;
5547 for (int i = 0; i < max; i++) {
5548 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5549 if (is_greedy) {
5550 alternation->AddAlternative(
5551 GuardedAlternative(body->ToNode(compiler, answer)));
5552 alternation->AddAlternative(GuardedAlternative(on_success));
5553 } else {
5554 alternation->AddAlternative(GuardedAlternative(on_success));
5555 alternation->AddAlternative(
5556 GuardedAlternative(body->ToNode(compiler, answer)));
5557 }
5558 answer = alternation;
5559 if (not_at_start && !compiler->read_backward()) {
5560 alternation->set_not_at_start();
5561 }
5562 }
5563 return answer;
5564 }
5565 }
5566 }
5567 bool has_min = min > 0;
5568 bool has_max = max < RegExpTree::kInfinity;
5569 bool needs_counter = has_min || has_max;
5570 int reg_ctr = needs_counter
5571 ? compiler->AllocateRegister()
5572 : RegExpCompiler::kNoRegister;
5573 LoopChoiceNode* center = new (zone)
5574 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5575 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5576 RegExpNode* loop_return = needs_counter
5577 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5578 : static_cast<RegExpNode*>(center);
5579 if (body_can_be_empty) {
5580 // If the body can be empty we need to check if it was and then
5581 // backtrack.
5582 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5583 reg_ctr,
5584 min,
5585 loop_return);
5586 }
5587 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5588 if (body_can_be_empty) {
5589 // If the body can be empty we need to store the start position
5590 // so we can bail out if it was empty.
5591 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5592 }
5593 if (needs_capture_clearing) {
5594 // Before entering the body of this loop we need to clear captures.
5595 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5596 }
5597 GuardedAlternative body_alt(body_node);
5598 if (has_max) {
5599 Guard* body_guard =
5600 new(zone) Guard(reg_ctr, Guard::LT, max);
5601 body_alt.AddGuard(body_guard, zone);
5602 }
5603 GuardedAlternative rest_alt(on_success);
5604 if (has_min) {
5605 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5606 rest_alt.AddGuard(rest_guard, zone);
5607 }
5608 if (is_greedy) {
5609 center->AddLoopAlternative(body_alt);
5610 center->AddContinueAlternative(rest_alt);
5611 } else {
5612 center->AddContinueAlternative(rest_alt);
5613 center->AddLoopAlternative(body_alt);
5614 }
5615 if (needs_counter) {
5616 return ActionNode::SetRegister(reg_ctr, 0, center);
5617 } else {
5618 return center;
5619 }
5620 }
5621
5622
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5624 RegExpNode* on_success) {
5625 NodeInfo info;
5626 Zone* zone = compiler->zone();
5627
5628 switch (assertion_type()) {
5629 case START_OF_LINE:
5630 return AssertionNode::AfterNewline(on_success);
5631 case START_OF_INPUT:
5632 return AssertionNode::AtStart(on_success);
5633 case BOUNDARY:
5634 return AssertionNode::AtBoundary(on_success);
5635 case NON_BOUNDARY:
5636 return AssertionNode::AtNonBoundary(on_success);
5637 case END_OF_INPUT:
5638 return AssertionNode::AtEnd(on_success);
5639 case END_OF_LINE: {
5640 // Compile $ in multiline regexps as an alternation with a positive
5641 // lookahead in one side and an end-of-input on the other side.
5642 // We need two registers for the lookahead.
5643 int stack_pointer_register = compiler->AllocateRegister();
5644 int position_register = compiler->AllocateRegister();
5645 // The ChoiceNode to distinguish between a newline and end-of-input.
5646 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5647 // Create a newline atom.
5648 ZoneList<CharacterRange>* newline_ranges =
5649 new(zone) ZoneList<CharacterRange>(3, zone);
5650 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5651 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5652 TextNode* newline_matcher = new (zone) TextNode(
5653 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5654 stack_pointer_register, position_register,
5655 0, // No captures inside.
5656 -1, // Ignored if no captures.
5657 on_success));
5658 // Create an end-of-input matcher.
5659 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5660 stack_pointer_register,
5661 position_register,
5662 newline_matcher);
5663 // Add the two alternatives to the ChoiceNode.
5664 GuardedAlternative eol_alternative(end_of_line);
5665 result->AddAlternative(eol_alternative);
5666 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5667 result->AddAlternative(end_alternative);
5668 return result;
5669 }
5670 default:
5671 UNREACHABLE();
5672 }
5673 return on_success;
5674 }
5675
5676
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5677 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5678 RegExpNode* on_success) {
5679 return new (compiler->zone())
5680 BackReferenceNode(RegExpCapture::StartRegister(index()),
5681 RegExpCapture::EndRegister(index()),
5682 compiler->read_backward(), on_success);
5683 }
5684
5685
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5686 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5687 RegExpNode* on_success) {
5688 return on_success;
5689 }
5690
5691
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5692 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5693 int stack_pointer_register,
5694 int position_register,
5695 int capture_register_count,
5696 int capture_register_start)
5697 : is_positive_(is_positive),
5698 on_success_(on_success),
5699 stack_pointer_register_(stack_pointer_register),
5700 position_register_(position_register) {
5701 if (is_positive_) {
5702 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5703 stack_pointer_register, position_register, capture_register_count,
5704 capture_register_start, on_success_);
5705 } else {
5706 Zone* zone = on_success_->zone();
5707 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5708 stack_pointer_register, position_register, capture_register_count,
5709 capture_register_start, zone);
5710 }
5711 }
5712
5713
ForMatch(RegExpNode * match)5714 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5715 if (is_positive_) {
5716 return ActionNode::BeginSubmatch(stack_pointer_register_,
5717 position_register_, match);
5718 } else {
5719 Zone* zone = on_success_->zone();
5720 // We use a ChoiceNode to represent the negative lookaround. The first
5721 // alternative is the negative match. On success, the end node backtracks.
5722 // On failure, the second alternative is tried and leads to success.
5723 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5724 // first exit when calculating quick checks.
5725 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5726 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5727 return ActionNode::BeginSubmatch(stack_pointer_register_,
5728 position_register_, choice_node);
5729 }
5730 }
5731
5732
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5733 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5734 RegExpNode* on_success) {
5735 int stack_pointer_register = compiler->AllocateRegister();
5736 int position_register = compiler->AllocateRegister();
5737
5738 const int registers_per_capture = 2;
5739 const int register_of_first_capture = 2;
5740 int register_count = capture_count_ * registers_per_capture;
5741 int register_start =
5742 register_of_first_capture + capture_from_ * registers_per_capture;
5743
5744 RegExpNode* result;
5745 bool was_reading_backward = compiler->read_backward();
5746 compiler->set_read_backward(type() == LOOKBEHIND);
5747 Builder builder(is_positive(), on_success, stack_pointer_register,
5748 position_register, register_count, register_start);
5749 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5750 result = builder.ForMatch(match);
5751 compiler->set_read_backward(was_reading_backward);
5752 return result;
5753 }
5754
5755
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5756 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5757 RegExpNode* on_success) {
5758 return ToNode(body(), index(), compiler, on_success);
5759 }
5760
5761
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5762 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5763 int index,
5764 RegExpCompiler* compiler,
5765 RegExpNode* on_success) {
5766 DCHECK_NOT_NULL(body);
5767 int start_reg = RegExpCapture::StartRegister(index);
5768 int end_reg = RegExpCapture::EndRegister(index);
5769 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5770 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5771 RegExpNode* body_node = body->ToNode(compiler, store_end);
5772 return ActionNode::StorePosition(start_reg, true, body_node);
5773 }
5774
5775
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5776 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5777 RegExpNode* on_success) {
5778 ZoneList<RegExpTree*>* children = nodes();
5779 RegExpNode* current = on_success;
5780 if (compiler->read_backward()) {
5781 for (int i = 0; i < children->length(); i++) {
5782 current = children->at(i)->ToNode(compiler, current);
5783 }
5784 } else {
5785 for (int i = children->length() - 1; i >= 0; i--) {
5786 current = children->at(i)->ToNode(compiler, current);
5787 }
5788 }
5789 return current;
5790 }
5791
5792
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5793 static void AddClass(const int* elmv,
5794 int elmc,
5795 ZoneList<CharacterRange>* ranges,
5796 Zone* zone) {
5797 elmc--;
5798 DCHECK(elmv[elmc] == kRangeEndMarker);
5799 for (int i = 0; i < elmc; i += 2) {
5800 DCHECK(elmv[i] < elmv[i + 1]);
5801 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5802 }
5803 }
5804
5805
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5806 static void AddClassNegated(const int *elmv,
5807 int elmc,
5808 ZoneList<CharacterRange>* ranges,
5809 Zone* zone) {
5810 elmc--;
5811 DCHECK(elmv[elmc] == kRangeEndMarker);
5812 DCHECK(elmv[0] != 0x0000);
5813 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5814 uc16 last = 0x0000;
5815 for (int i = 0; i < elmc; i += 2) {
5816 DCHECK(last <= elmv[i] - 1);
5817 DCHECK(elmv[i] < elmv[i + 1]);
5818 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5819 last = elmv[i + 1];
5820 }
5821 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5822 }
5823
5824
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5825 void CharacterRange::AddClassEscape(uc16 type,
5826 ZoneList<CharacterRange>* ranges,
5827 Zone* zone) {
5828 switch (type) {
5829 case 's':
5830 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5831 break;
5832 case 'S':
5833 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5834 break;
5835 case 'w':
5836 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5837 break;
5838 case 'W':
5839 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5840 break;
5841 case 'd':
5842 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5843 break;
5844 case 'D':
5845 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5846 break;
5847 case '.':
5848 AddClassNegated(kLineTerminatorRanges,
5849 kLineTerminatorRangeCount,
5850 ranges,
5851 zone);
5852 break;
5853 // This is not a character range as defined by the spec but a
5854 // convenient shorthand for a character class that matches any
5855 // character.
5856 case '*':
5857 ranges->Add(CharacterRange::Everything(), zone);
5858 break;
5859 // This is the set of characters matched by the $ and ^ symbols
5860 // in multiline mode.
5861 case 'n':
5862 AddClass(kLineTerminatorRanges,
5863 kLineTerminatorRangeCount,
5864 ranges,
5865 zone);
5866 break;
5867 default:
5868 UNREACHABLE();
5869 }
5870 }
5871
5872
GetWordBounds()5873 Vector<const int> CharacterRange::GetWordBounds() {
5874 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5875 }
5876
5877
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5878 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5879 ZoneList<CharacterRange>* ranges,
5880 bool is_one_byte) {
5881 CharacterRange::Canonicalize(ranges);
5882 int range_count = ranges->length();
5883 for (int i = 0; i < range_count; i++) {
5884 CharacterRange range = ranges->at(i);
5885 uc32 bottom = range.from();
5886 if (bottom > String::kMaxUtf16CodeUnit) return;
5887 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5888 // Nothing to be done for surrogates.
5889 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5890 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5891 if (bottom > String::kMaxOneByteCharCode) return;
5892 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5893 }
5894 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5895 if (top == bottom) {
5896 // If this is a singleton we just expand the one character.
5897 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5898 for (int i = 0; i < length; i++) {
5899 uc32 chr = chars[i];
5900 if (chr != bottom) {
5901 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5902 }
5903 }
5904 } else {
5905 // If this is a range we expand the characters block by block, expanding
5906 // contiguous subranges (blocks) one at a time. The approach is as
5907 // follows. For a given start character we look up the remainder of the
5908 // block that contains it (represented by the end point), for instance we
5909 // find 'z' if the character is 'c'. A block is characterized by the
5910 // property that all characters uncanonicalize in the same way, except
5911 // that each entry in the result is incremented by the distance from the
5912 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5913 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5914 // we've found the end point we look up its uncanonicalization and
5915 // produce a range for each element. For instance for [c-f] we look up
5916 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5917 // it is not already contained in the input, so [c-f] will be skipped but
5918 // [C-F] will be added. If this range is not completely contained in a
5919 // block we do this for all the blocks covered by the range (handling
5920 // characters that is not in a block as a "singleton block").
5921 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5922 int pos = bottom;
5923 while (pos <= top) {
5924 int length =
5925 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5926 uc32 block_end;
5927 if (length == 0) {
5928 block_end = pos;
5929 } else {
5930 DCHECK_EQ(1, length);
5931 block_end = equivalents[0];
5932 }
5933 int end = (block_end > top) ? top : block_end;
5934 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5935 equivalents);
5936 for (int i = 0; i < length; i++) {
5937 uc32 c = equivalents[i];
5938 uc32 range_from = c - (block_end - pos);
5939 uc32 range_to = c - (block_end - end);
5940 if (!(bottom <= range_from && range_to <= top)) {
5941 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5942 }
5943 }
5944 pos = end + 1;
5945 }
5946 }
5947 }
5948 }
5949
5950
IsCanonical(ZoneList<CharacterRange> * ranges)5951 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5952 DCHECK_NOT_NULL(ranges);
5953 int n = ranges->length();
5954 if (n <= 1) return true;
5955 int max = ranges->at(0).to();
5956 for (int i = 1; i < n; i++) {
5957 CharacterRange next_range = ranges->at(i);
5958 if (next_range.from() <= max + 1) return false;
5959 max = next_range.to();
5960 }
5961 return true;
5962 }
5963
5964
ranges(Zone * zone)5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5966 if (ranges_ == NULL) {
5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5969 }
5970 return ranges_;
5971 }
5972
5973
5974 // Move a number of elements in a zonelist to another position
5975 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5976 static void MoveRanges(ZoneList<CharacterRange>* list,
5977 int from,
5978 int to,
5979 int count) {
5980 // Ranges are potentially overlapping.
5981 if (from < to) {
5982 for (int i = count - 1; i >= 0; i--) {
5983 list->at(to + i) = list->at(from + i);
5984 }
5985 } else {
5986 for (int i = 0; i < count; i++) {
5987 list->at(to + i) = list->at(from + i);
5988 }
5989 }
5990 }
5991
5992
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)5993 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5994 int count,
5995 CharacterRange insert) {
5996 // Inserts a range into list[0..count[, which must be sorted
5997 // by from value and non-overlapping and non-adjacent, using at most
5998 // list[0..count] for the result. Returns the number of resulting
5999 // canonicalized ranges. Inserting a range may collapse existing ranges into
6000 // fewer ranges, so the return value can be anything in the range 1..count+1.
6001 uc32 from = insert.from();
6002 uc32 to = insert.to();
6003 int start_pos = 0;
6004 int end_pos = count;
6005 for (int i = count - 1; i >= 0; i--) {
6006 CharacterRange current = list->at(i);
6007 if (current.from() > to + 1) {
6008 end_pos = i;
6009 } else if (current.to() + 1 < from) {
6010 start_pos = i + 1;
6011 break;
6012 }
6013 }
6014
6015 // Inserted range overlaps, or is adjacent to, ranges at positions
6016 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6017 // not affected by the insertion.
6018 // If start_pos == end_pos, the range must be inserted before start_pos.
6019 // if start_pos < end_pos, the entire range from start_pos to end_pos
6020 // must be merged with the insert range.
6021
6022 if (start_pos == end_pos) {
6023 // Insert between existing ranges at position start_pos.
6024 if (start_pos < count) {
6025 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6026 }
6027 list->at(start_pos) = insert;
6028 return count + 1;
6029 }
6030 if (start_pos + 1 == end_pos) {
6031 // Replace single existing range at position start_pos.
6032 CharacterRange to_replace = list->at(start_pos);
6033 int new_from = Min(to_replace.from(), from);
6034 int new_to = Max(to_replace.to(), to);
6035 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6036 return count;
6037 }
6038 // Replace a number of existing ranges from start_pos to end_pos - 1.
6039 // Move the remaining ranges down.
6040
6041 int new_from = Min(list->at(start_pos).from(), from);
6042 int new_to = Max(list->at(end_pos - 1).to(), to);
6043 if (end_pos < count) {
6044 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6045 }
6046 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6047 return count - (end_pos - start_pos) + 1;
6048 }
6049
6050
Canonicalize()6051 void CharacterSet::Canonicalize() {
6052 // Special/default classes are always considered canonical. The result
6053 // of calling ranges() will be sorted.
6054 if (ranges_ == NULL) return;
6055 CharacterRange::Canonicalize(ranges_);
6056 }
6057
6058
Canonicalize(ZoneList<CharacterRange> * character_ranges)6059 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6060 if (character_ranges->length() <= 1) return;
6061 // Check whether ranges are already canonical (increasing, non-overlapping,
6062 // non-adjacent).
6063 int n = character_ranges->length();
6064 int max = character_ranges->at(0).to();
6065 int i = 1;
6066 while (i < n) {
6067 CharacterRange current = character_ranges->at(i);
6068 if (current.from() <= max + 1) {
6069 break;
6070 }
6071 max = current.to();
6072 i++;
6073 }
6074 // Canonical until the i'th range. If that's all of them, we are done.
6075 if (i == n) return;
6076
6077 // The ranges at index i and forward are not canonicalized. Make them so by
6078 // doing the equivalent of insertion sort (inserting each into the previous
6079 // list, in order).
6080 // Notice that inserting a range can reduce the number of ranges in the
6081 // result due to combining of adjacent and overlapping ranges.
6082 int read = i; // Range to insert.
6083 int num_canonical = i; // Length of canonicalized part of list.
6084 do {
6085 num_canonical = InsertRangeInCanonicalList(character_ranges,
6086 num_canonical,
6087 character_ranges->at(read));
6088 read++;
6089 } while (read < n);
6090 character_ranges->Rewind(num_canonical);
6091
6092 DCHECK(CharacterRange::IsCanonical(character_ranges));
6093 }
6094
6095
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6096 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6097 ZoneList<CharacterRange>* negated_ranges,
6098 Zone* zone) {
6099 DCHECK(CharacterRange::IsCanonical(ranges));
6100 DCHECK_EQ(0, negated_ranges->length());
6101 int range_count = ranges->length();
6102 uc32 from = 0;
6103 int i = 0;
6104 if (range_count > 0 && ranges->at(0).from() == 0) {
6105 from = ranges->at(0).to() + 1;
6106 i = 1;
6107 }
6108 while (i < range_count) {
6109 CharacterRange range = ranges->at(i);
6110 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6111 from = range.to() + 1;
6112 i++;
6113 }
6114 if (from < String::kMaxCodePoint) {
6115 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6116 zone);
6117 }
6118 }
6119
6120
6121 // -------------------------------------------------------------------
6122 // Splay tree
6123
6124
Extend(unsigned value,Zone * zone)6125 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6126 if (Get(value))
6127 return this;
6128 if (successors(zone) != NULL) {
6129 for (int i = 0; i < successors(zone)->length(); i++) {
6130 OutSet* successor = successors(zone)->at(i);
6131 if (successor->Get(value))
6132 return successor;
6133 }
6134 } else {
6135 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6136 }
6137 OutSet* result = new(zone) OutSet(first_, remaining_);
6138 result->Set(value, zone);
6139 successors(zone)->Add(result, zone);
6140 return result;
6141 }
6142
6143
Set(unsigned value,Zone * zone)6144 void OutSet::Set(unsigned value, Zone *zone) {
6145 if (value < kFirstLimit) {
6146 first_ |= (1 << value);
6147 } else {
6148 if (remaining_ == NULL)
6149 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6150 if (remaining_->is_empty() || !remaining_->Contains(value))
6151 remaining_->Add(value, zone);
6152 }
6153 }
6154
6155
Get(unsigned value) const6156 bool OutSet::Get(unsigned value) const {
6157 if (value < kFirstLimit) {
6158 return (first_ & (1 << value)) != 0;
6159 } else if (remaining_ == NULL) {
6160 return false;
6161 } else {
6162 return remaining_->Contains(value);
6163 }
6164 }
6165
6166
6167 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6168
6169
AddRange(CharacterRange full_range,int value,Zone * zone)6170 void DispatchTable::AddRange(CharacterRange full_range, int value,
6171 Zone* zone) {
6172 CharacterRange current = full_range;
6173 if (tree()->is_empty()) {
6174 // If this is the first range we just insert into the table.
6175 ZoneSplayTree<Config>::Locator loc;
6176 bool inserted = tree()->Insert(current.from(), &loc);
6177 DCHECK(inserted);
6178 USE(inserted);
6179 loc.set_value(Entry(current.from(), current.to(),
6180 empty()->Extend(value, zone)));
6181 return;
6182 }
6183 // First see if there is a range to the left of this one that
6184 // overlaps.
6185 ZoneSplayTree<Config>::Locator loc;
6186 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6187 Entry* entry = &loc.value();
6188 // If we've found a range that overlaps with this one, and it
6189 // starts strictly to the left of this one, we have to fix it
6190 // because the following code only handles ranges that start on
6191 // or after the start point of the range we're adding.
6192 if (entry->from() < current.from() && entry->to() >= current.from()) {
6193 // Snap the overlapping range in half around the start point of
6194 // the range we're adding.
6195 CharacterRange left =
6196 CharacterRange::Range(entry->from(), current.from() - 1);
6197 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6198 // The left part of the overlapping range doesn't overlap.
6199 // Truncate the whole entry to be just the left part.
6200 entry->set_to(left.to());
6201 // The right part is the one that overlaps. We add this part
6202 // to the map and let the next step deal with merging it with
6203 // the range we're adding.
6204 ZoneSplayTree<Config>::Locator loc;
6205 bool inserted = tree()->Insert(right.from(), &loc);
6206 DCHECK(inserted);
6207 USE(inserted);
6208 loc.set_value(Entry(right.from(),
6209 right.to(),
6210 entry->out_set()));
6211 }
6212 }
6213 while (current.is_valid()) {
6214 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6215 (loc.value().from() <= current.to()) &&
6216 (loc.value().to() >= current.from())) {
6217 Entry* entry = &loc.value();
6218 // We have overlap. If there is space between the start point of
6219 // the range we're adding and where the overlapping range starts
6220 // then we have to add a range covering just that space.
6221 if (current.from() < entry->from()) {
6222 ZoneSplayTree<Config>::Locator ins;
6223 bool inserted = tree()->Insert(current.from(), &ins);
6224 DCHECK(inserted);
6225 USE(inserted);
6226 ins.set_value(Entry(current.from(),
6227 entry->from() - 1,
6228 empty()->Extend(value, zone)));
6229 current.set_from(entry->from());
6230 }
6231 DCHECK_EQ(current.from(), entry->from());
6232 // If the overlapping range extends beyond the one we want to add
6233 // we have to snap the right part off and add it separately.
6234 if (entry->to() > current.to()) {
6235 ZoneSplayTree<Config>::Locator ins;
6236 bool inserted = tree()->Insert(current.to() + 1, &ins);
6237 DCHECK(inserted);
6238 USE(inserted);
6239 ins.set_value(Entry(current.to() + 1,
6240 entry->to(),
6241 entry->out_set()));
6242 entry->set_to(current.to());
6243 }
6244 DCHECK(entry->to() <= current.to());
6245 // The overlapping range is now completely contained by the range
6246 // we're adding so we can just update it and move the start point
6247 // of the range we're adding just past it.
6248 entry->AddValue(value, zone);
6249 DCHECK(entry->to() + 1 > current.from());
6250 current.set_from(entry->to() + 1);
6251 } else {
6252 // There is no overlap so we can just add the range
6253 ZoneSplayTree<Config>::Locator ins;
6254 bool inserted = tree()->Insert(current.from(), &ins);
6255 DCHECK(inserted);
6256 USE(inserted);
6257 ins.set_value(Entry(current.from(),
6258 current.to(),
6259 empty()->Extend(value, zone)));
6260 break;
6261 }
6262 }
6263 }
6264
6265
Get(uc32 value)6266 OutSet* DispatchTable::Get(uc32 value) {
6267 ZoneSplayTree<Config>::Locator loc;
6268 if (!tree()->FindGreatestLessThan(value, &loc))
6269 return empty();
6270 Entry* entry = &loc.value();
6271 if (value <= entry->to())
6272 return entry->out_set();
6273 else
6274 return empty();
6275 }
6276
6277
6278 // -------------------------------------------------------------------
6279 // Analysis
6280
6281
EnsureAnalyzed(RegExpNode * that)6282 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6283 StackLimitCheck check(isolate());
6284 if (check.HasOverflowed()) {
6285 fail("Stack overflow");
6286 return;
6287 }
6288 if (that->info()->been_analyzed || that->info()->being_analyzed)
6289 return;
6290 that->info()->being_analyzed = true;
6291 that->Accept(this);
6292 that->info()->being_analyzed = false;
6293 that->info()->been_analyzed = true;
6294 }
6295
6296
VisitEnd(EndNode * that)6297 void Analysis::VisitEnd(EndNode* that) {
6298 // nothing to do
6299 }
6300
6301
CalculateOffsets()6302 void TextNode::CalculateOffsets() {
6303 int element_count = elements()->length();
6304 // Set up the offsets of the elements relative to the start. This is a fixed
6305 // quantity since a TextNode can only contain fixed-width things.
6306 int cp_offset = 0;
6307 for (int i = 0; i < element_count; i++) {
6308 TextElement& elm = elements()->at(i);
6309 elm.set_cp_offset(cp_offset);
6310 cp_offset += elm.length();
6311 }
6312 }
6313
6314
VisitText(TextNode * that)6315 void Analysis::VisitText(TextNode* that) {
6316 if (ignore_case()) {
6317 that->MakeCaseIndependent(isolate(), is_one_byte_);
6318 }
6319 EnsureAnalyzed(that->on_success());
6320 if (!has_failed()) {
6321 that->CalculateOffsets();
6322 }
6323 }
6324
6325
VisitAction(ActionNode * that)6326 void Analysis::VisitAction(ActionNode* that) {
6327 RegExpNode* target = that->on_success();
6328 EnsureAnalyzed(target);
6329 if (!has_failed()) {
6330 // If the next node is interested in what it follows then this node
6331 // has to be interested too so it can pass the information on.
6332 that->info()->AddFromFollowing(target->info());
6333 }
6334 }
6335
6336
VisitChoice(ChoiceNode * that)6337 void Analysis::VisitChoice(ChoiceNode* that) {
6338 NodeInfo* info = that->info();
6339 for (int i = 0; i < that->alternatives()->length(); i++) {
6340 RegExpNode* node = that->alternatives()->at(i).node();
6341 EnsureAnalyzed(node);
6342 if (has_failed()) return;
6343 // Anything the following nodes need to know has to be known by
6344 // this node also, so it can pass it on.
6345 info->AddFromFollowing(node->info());
6346 }
6347 }
6348
6349
VisitLoopChoice(LoopChoiceNode * that)6350 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6351 NodeInfo* info = that->info();
6352 for (int i = 0; i < that->alternatives()->length(); i++) {
6353 RegExpNode* node = that->alternatives()->at(i).node();
6354 if (node != that->loop_node()) {
6355 EnsureAnalyzed(node);
6356 if (has_failed()) return;
6357 info->AddFromFollowing(node->info());
6358 }
6359 }
6360 // Check the loop last since it may need the value of this node
6361 // to get a correct result.
6362 EnsureAnalyzed(that->loop_node());
6363 if (!has_failed()) {
6364 info->AddFromFollowing(that->loop_node()->info());
6365 }
6366 }
6367
6368
VisitBackReference(BackReferenceNode * that)6369 void Analysis::VisitBackReference(BackReferenceNode* that) {
6370 EnsureAnalyzed(that->on_success());
6371 }
6372
6373
VisitAssertion(AssertionNode * that)6374 void Analysis::VisitAssertion(AssertionNode* that) {
6375 EnsureAnalyzed(that->on_success());
6376 }
6377
6378
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6379 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6380 BoyerMooreLookahead* bm,
6381 bool not_at_start) {
6382 // Working out the set of characters that a backreference can match is too
6383 // hard, so we just say that any character can match.
6384 bm->SetRest(offset);
6385 SaveBMInfo(bm, not_at_start, offset);
6386 }
6387
6388
6389 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6390 RegExpMacroAssembler::kTableSize);
6391
6392
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6393 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6394 BoyerMooreLookahead* bm, bool not_at_start) {
6395 ZoneList<GuardedAlternative>* alts = alternatives();
6396 budget = (budget - 1) / alts->length();
6397 for (int i = 0; i < alts->length(); i++) {
6398 GuardedAlternative& alt = alts->at(i);
6399 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6400 bm->SetRest(offset); // Give up trying to fill in info.
6401 SaveBMInfo(bm, not_at_start, offset);
6402 return;
6403 }
6404 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6405 }
6406 SaveBMInfo(bm, not_at_start, offset);
6407 }
6408
6409
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6410 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6411 BoyerMooreLookahead* bm, bool not_at_start) {
6412 if (initial_offset >= bm->length()) return;
6413 int offset = initial_offset;
6414 int max_char = bm->max_char();
6415 for (int i = 0; i < elements()->length(); i++) {
6416 if (offset >= bm->length()) {
6417 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6418 return;
6419 }
6420 TextElement text = elements()->at(i);
6421 if (text.text_type() == TextElement::ATOM) {
6422 RegExpAtom* atom = text.atom();
6423 for (int j = 0; j < atom->length(); j++, offset++) {
6424 if (offset >= bm->length()) {
6425 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6426 return;
6427 }
6428 uc16 character = atom->data()[j];
6429 if (bm->compiler()->ignore_case()) {
6430 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6431 int length = GetCaseIndependentLetters(
6432 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6433 chars);
6434 for (int j = 0; j < length; j++) {
6435 bm->Set(offset, chars[j]);
6436 }
6437 } else {
6438 if (character <= max_char) bm->Set(offset, character);
6439 }
6440 }
6441 } else {
6442 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6443 RegExpCharacterClass* char_class = text.char_class();
6444 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6445 if (char_class->is_negated()) {
6446 bm->SetAll(offset);
6447 } else {
6448 for (int k = 0; k < ranges->length(); k++) {
6449 CharacterRange& range = ranges->at(k);
6450 if (range.from() > max_char) continue;
6451 int to = Min(max_char, static_cast<int>(range.to()));
6452 bm->SetInterval(offset, Interval(range.from(), to));
6453 }
6454 }
6455 offset++;
6456 }
6457 }
6458 if (offset >= bm->length()) {
6459 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6460 return;
6461 }
6462 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6463 true); // Not at start after a text node.
6464 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6465 }
6466
6467
6468 // -------------------------------------------------------------------
6469 // Dispatch table construction
6470
6471
VisitEnd(EndNode * that)6472 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6473 AddRange(CharacterRange::Everything());
6474 }
6475
6476
BuildTable(ChoiceNode * node)6477 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6478 node->set_being_calculated(true);
6479 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6480 for (int i = 0; i < alternatives->length(); i++) {
6481 set_choice_index(i);
6482 alternatives->at(i).node()->Accept(this);
6483 }
6484 node->set_being_calculated(false);
6485 }
6486
6487
6488 class AddDispatchRange {
6489 public:
AddDispatchRange(DispatchTableConstructor * constructor)6490 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6491 : constructor_(constructor) { }
6492 void Call(uc32 from, DispatchTable::Entry entry);
6493 private:
6494 DispatchTableConstructor* constructor_;
6495 };
6496
6497
Call(uc32 from,DispatchTable::Entry entry)6498 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6499 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6500 }
6501
6502
VisitChoice(ChoiceNode * node)6503 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6504 if (node->being_calculated())
6505 return;
6506 DispatchTable* table = node->GetTable(ignore_case_);
6507 AddDispatchRange adder(this);
6508 table->ForEach(&adder);
6509 }
6510
6511
VisitBackReference(BackReferenceNode * that)6512 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6513 // TODO(160): Find the node that we refer back to and propagate its start
6514 // set back to here. For now we just accept anything.
6515 AddRange(CharacterRange::Everything());
6516 }
6517
6518
VisitAssertion(AssertionNode * that)6519 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6520 RegExpNode* target = that->on_success();
6521 target->Accept(this);
6522 }
6523
6524
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6525 static int CompareRangeByFrom(const CharacterRange* a,
6526 const CharacterRange* b) {
6527 return Compare<uc16>(a->from(), b->from());
6528 }
6529
6530
AddInverse(ZoneList<CharacterRange> * ranges)6531 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6532 ranges->Sort(CompareRangeByFrom);
6533 uc16 last = 0;
6534 for (int i = 0; i < ranges->length(); i++) {
6535 CharacterRange range = ranges->at(i);
6536 if (last < range.from())
6537 AddRange(CharacterRange::Range(last, range.from() - 1));
6538 if (range.to() >= last) {
6539 if (range.to() == String::kMaxCodePoint) {
6540 return;
6541 } else {
6542 last = range.to() + 1;
6543 }
6544 }
6545 }
6546 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6547 }
6548
6549
VisitText(TextNode * that)6550 void DispatchTableConstructor::VisitText(TextNode* that) {
6551 TextElement elm = that->elements()->at(0);
6552 switch (elm.text_type()) {
6553 case TextElement::ATOM: {
6554 uc16 c = elm.atom()->data()[0];
6555 AddRange(CharacterRange::Range(c, c));
6556 break;
6557 }
6558 case TextElement::CHAR_CLASS: {
6559 RegExpCharacterClass* tree = elm.char_class();
6560 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6561 if (tree->is_negated()) {
6562 AddInverse(ranges);
6563 } else {
6564 for (int i = 0; i < ranges->length(); i++)
6565 AddRange(ranges->at(i));
6566 }
6567 break;
6568 }
6569 default: {
6570 UNIMPLEMENTED();
6571 }
6572 }
6573 }
6574
6575
VisitAction(ActionNode * that)6576 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6577 RegExpNode* target = that->on_success();
6578 target->Accept(this);
6579 }
6580
6581
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success)6582 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6583 RegExpNode* on_success) {
6584 // If the regexp matching starts within a surrogate pair, step back
6585 // to the lead surrogate and start matching from there.
6586 DCHECK(!compiler->read_backward());
6587 Zone* zone = compiler->zone();
6588 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6589 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6590 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6591 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6592
6593 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6594
6595 int stack_register = compiler->UnicodeLookaroundStackRegister();
6596 int position_register = compiler->UnicodeLookaroundPositionRegister();
6597 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6598 zone, lead_surrogates, true, on_success);
6599 RegExpLookaround::Builder builder(true, step_back, stack_register,
6600 position_register);
6601 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6602 zone, trail_surrogates, false, builder.on_match_success());
6603
6604 optional_step_back->AddAlternative(
6605 GuardedAlternative(builder.ForMatch(match_trail)));
6606 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6607
6608 return optional_step_back;
6609 }
6610
6611
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6612 RegExpEngine::CompilationResult RegExpEngine::Compile(
6613 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6614 JSRegExp::Flags flags, Handle<String> pattern,
6615 Handle<String> sample_subject, bool is_one_byte) {
6616 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6617 return IrregexpRegExpTooBig(isolate);
6618 }
6619 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6620 bool is_sticky = flags & JSRegExp::kSticky;
6621 bool is_global = flags & JSRegExp::kGlobal;
6622 bool is_unicode = flags & JSRegExp::kUnicode;
6623 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6624 is_one_byte);
6625
6626 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6627
6628 // Sample some characters from the middle of the string.
6629 static const int kSampleSize = 128;
6630
6631 sample_subject = String::Flatten(sample_subject);
6632 int chars_sampled = 0;
6633 int half_way = (sample_subject->length() - kSampleSize) / 2;
6634 for (int i = Max(0, half_way);
6635 i < sample_subject->length() && chars_sampled < kSampleSize;
6636 i++, chars_sampled++) {
6637 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6638 }
6639
6640 // Wrap the body of the regexp in capture #0.
6641 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6642 0,
6643 &compiler,
6644 compiler.accept());
6645 RegExpNode* node = captured_body;
6646 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6647 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6648 int max_length = data->tree->max_match();
6649 if (!is_start_anchored && !is_sticky) {
6650 // Add a .*? at the beginning, outside the body capture, unless
6651 // this expression is anchored at the beginning or sticky.
6652 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6653 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6654 &compiler, captured_body, data->contains_anchor);
6655
6656 if (data->contains_anchor) {
6657 // Unroll loop once, to take care of the case that might start
6658 // at the start of input.
6659 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6660 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6661 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6662 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6663 node = first_step_node;
6664 } else {
6665 node = loop_node;
6666 }
6667 }
6668 if (is_one_byte) {
6669 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6670 // Do it again to propagate the new nodes to places where they were not
6671 // put because they had not been calculated yet.
6672 if (node != NULL) {
6673 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6674 }
6675 } else if (compiler.unicode() && (is_global || is_sticky)) {
6676 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6677 }
6678
6679 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6680 data->node = node;
6681 Analysis analysis(isolate, flags, is_one_byte);
6682 analysis.EnsureAnalyzed(node);
6683 if (analysis.has_failed()) {
6684 const char* error_message = analysis.error_message();
6685 return CompilationResult(isolate, error_message);
6686 }
6687
6688 // Create the correct assembler for the architecture.
6689 #ifndef V8_INTERPRETED_REGEXP
6690 // Native regexp implementation.
6691
6692 NativeRegExpMacroAssembler::Mode mode =
6693 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6694 : NativeRegExpMacroAssembler::UC16;
6695
6696 #if V8_TARGET_ARCH_IA32
6697 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6698 (data->capture_count + 1) * 2);
6699 #elif V8_TARGET_ARCH_X64
6700 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6701 (data->capture_count + 1) * 2);
6702 #elif V8_TARGET_ARCH_ARM
6703 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6704 (data->capture_count + 1) * 2);
6705 #elif V8_TARGET_ARCH_ARM64
6706 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6707 (data->capture_count + 1) * 2);
6708 #elif V8_TARGET_ARCH_S390
6709 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6710 (data->capture_count + 1) * 2);
6711 #elif V8_TARGET_ARCH_PPC
6712 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6713 (data->capture_count + 1) * 2);
6714 #elif V8_TARGET_ARCH_MIPS
6715 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6716 (data->capture_count + 1) * 2);
6717 #elif V8_TARGET_ARCH_MIPS64
6718 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6719 (data->capture_count + 1) * 2);
6720 #elif V8_TARGET_ARCH_X87
6721 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6722 (data->capture_count + 1) * 2);
6723 #else
6724 #error "Unsupported architecture"
6725 #endif
6726
6727 #else // V8_INTERPRETED_REGEXP
6728 // Interpreted regexp implementation.
6729 EmbeddedVector<byte, 1024> codes;
6730 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6731 #endif // V8_INTERPRETED_REGEXP
6732
6733 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6734
6735 // Inserted here, instead of in Assembler, because it depends on information
6736 // in the AST that isn't replicated in the Node structure.
6737 static const int kMaxBacksearchLimit = 1024;
6738 if (is_end_anchored && !is_start_anchored && !is_sticky &&
6739 max_length < kMaxBacksearchLimit) {
6740 macro_assembler.SetCurrentPositionFromEnd(max_length);
6741 }
6742
6743 if (is_global) {
6744 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6745 if (data->tree->min_match() > 0) {
6746 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6747 } else if (is_unicode) {
6748 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6749 }
6750 macro_assembler.set_global_mode(mode);
6751 }
6752
6753 return compiler.Assemble(¯o_assembler,
6754 node,
6755 data->capture_count,
6756 pattern);
6757 }
6758
6759
TooMuchRegExpCode(Handle<String> pattern)6760 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6761 Heap* heap = pattern->GetHeap();
6762 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6763 if (heap->isolate()->total_regexp_code_generated() >
6764 RegExpImpl::kRegExpCompiledLimit &&
6765 heap->CommittedMemoryExecutable() >
6766 RegExpImpl::kRegExpExecutableMemoryLimit) {
6767 too_much = true;
6768 }
6769 return too_much;
6770 }
6771
6772
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6773 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6774 Object* key_pattern,
6775 FixedArray** last_match_cache,
6776 ResultsCacheType type) {
6777 FixedArray* cache;
6778 if (!key_string->IsInternalizedString()) return Smi::kZero;
6779 if (type == STRING_SPLIT_SUBSTRINGS) {
6780 DCHECK(key_pattern->IsString());
6781 if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6782 cache = heap->string_split_cache();
6783 } else {
6784 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6785 DCHECK(key_pattern->IsFixedArray());
6786 cache = heap->regexp_multiple_cache();
6787 }
6788
6789 uint32_t hash = key_string->Hash();
6790 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6791 ~(kArrayEntriesPerCacheEntry - 1));
6792 if (cache->get(index + kStringOffset) != key_string ||
6793 cache->get(index + kPatternOffset) != key_pattern) {
6794 index =
6795 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6796 if (cache->get(index + kStringOffset) != key_string ||
6797 cache->get(index + kPatternOffset) != key_pattern) {
6798 return Smi::kZero;
6799 }
6800 }
6801
6802 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6803 return cache->get(index + kArrayOffset);
6804 }
6805
6806
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6807 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6808 Handle<Object> key_pattern,
6809 Handle<FixedArray> value_array,
6810 Handle<FixedArray> last_match_cache,
6811 ResultsCacheType type) {
6812 Factory* factory = isolate->factory();
6813 Handle<FixedArray> cache;
6814 if (!key_string->IsInternalizedString()) return;
6815 if (type == STRING_SPLIT_SUBSTRINGS) {
6816 DCHECK(key_pattern->IsString());
6817 if (!key_pattern->IsInternalizedString()) return;
6818 cache = factory->string_split_cache();
6819 } else {
6820 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6821 DCHECK(key_pattern->IsFixedArray());
6822 cache = factory->regexp_multiple_cache();
6823 }
6824
6825 uint32_t hash = key_string->Hash();
6826 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6827 ~(kArrayEntriesPerCacheEntry - 1));
6828 if (cache->get(index + kStringOffset) == Smi::kZero) {
6829 cache->set(index + kStringOffset, *key_string);
6830 cache->set(index + kPatternOffset, *key_pattern);
6831 cache->set(index + kArrayOffset, *value_array);
6832 cache->set(index + kLastMatchOffset, *last_match_cache);
6833 } else {
6834 uint32_t index2 =
6835 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6836 if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6837 cache->set(index2 + kStringOffset, *key_string);
6838 cache->set(index2 + kPatternOffset, *key_pattern);
6839 cache->set(index2 + kArrayOffset, *value_array);
6840 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6841 } else {
6842 cache->set(index2 + kStringOffset, Smi::kZero);
6843 cache->set(index2 + kPatternOffset, Smi::kZero);
6844 cache->set(index2 + kArrayOffset, Smi::kZero);
6845 cache->set(index2 + kLastMatchOffset, Smi::kZero);
6846 cache->set(index + kStringOffset, *key_string);
6847 cache->set(index + kPatternOffset, *key_pattern);
6848 cache->set(index + kArrayOffset, *value_array);
6849 cache->set(index + kLastMatchOffset, *last_match_cache);
6850 }
6851 }
6852 // If the array is a reasonably short list of substrings, convert it into a
6853 // list of internalized strings.
6854 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6855 for (int i = 0; i < value_array->length(); i++) {
6856 Handle<String> str(String::cast(value_array->get(i)), isolate);
6857 Handle<String> internalized_str = factory->InternalizeString(str);
6858 value_array->set(i, *internalized_str);
6859 }
6860 }
6861 // Convert backing store to a copy-on-write array.
6862 value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
6863 }
6864
6865
Clear(FixedArray * cache)6866 void RegExpResultsCache::Clear(FixedArray* cache) {
6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6868 cache->set(i, Smi::kZero);
6869 }
6870 }
6871
6872 } // namespace internal
6873 } // namespace v8
6874