1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "YarrInterpreter.h"
29
30 #include "Yarr.h"
31 #include <wtf/BumpPointerAllocator.h>
32
33 #ifndef NDEBUG
34 #include <stdio.h>
35 #endif
36
37 using namespace WTF;
38
39 namespace JSC { namespace Yarr {
40
41 class Interpreter {
42 public:
43 struct ParenthesesDisjunctionContext;
44
45 struct BackTrackInfoPatternCharacter {
46 uintptr_t matchAmount;
47 };
48 struct BackTrackInfoCharacterClass {
49 uintptr_t matchAmount;
50 };
51 struct BackTrackInfoBackReference {
52 uintptr_t begin; // Not really needed for greedy quantifiers.
53 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
54 };
55 struct BackTrackInfoAlternative {
56 uintptr_t offset;
57 };
58 struct BackTrackInfoParentheticalAssertion {
59 uintptr_t begin;
60 };
61 struct BackTrackInfoParenthesesOnce {
62 uintptr_t begin;
63 };
64 struct BackTrackInfoParenthesesTerminal {
65 uintptr_t begin;
66 };
67 struct BackTrackInfoParentheses {
68 uintptr_t matchAmount;
69 ParenthesesDisjunctionContext* lastContext;
70 };
71
appendParenthesesDisjunctionContext(BackTrackInfoParentheses * backTrack,ParenthesesDisjunctionContext * context)72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73 {
74 context->next = backTrack->lastContext;
75 backTrack->lastContext = context;
76 ++backTrack->matchAmount;
77 }
78
popParenthesesDisjunctionContext(BackTrackInfoParentheses * backTrack)79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80 {
81 ASSERT(backTrack->matchAmount);
82 ASSERT(backTrack->lastContext);
83 backTrack->lastContext = backTrack->lastContext->next;
84 --backTrack->matchAmount;
85 }
86
87 struct DisjunctionContext
88 {
DisjunctionContextJSC::Yarr::Interpreter::DisjunctionContext89 DisjunctionContext()
90 : term(0)
91 {
92 }
93
operator newJSC::Yarr::Interpreter::DisjunctionContext94 void* operator new(size_t, void* where)
95 {
96 return where;
97 }
98
99 int term;
100 unsigned matchBegin;
101 unsigned matchEnd;
102 uintptr_t frame[1];
103 };
104
allocDisjunctionContext(ByteDisjunction * disjunction)105 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106 {
107 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
108 allocatorPool = allocatorPool->ensureCapacity(size);
109 if (!allocatorPool)
110 CRASH();
111 return new(allocatorPool->alloc(size)) DisjunctionContext();
112 }
113
freeDisjunctionContext(DisjunctionContext * context)114 void freeDisjunctionContext(DisjunctionContext* context)
115 {
116 allocatorPool = allocatorPool->dealloc(context);
117 }
118
119 struct ParenthesesDisjunctionContext
120 {
ParenthesesDisjunctionContextJSC::Yarr::Interpreter::ParenthesesDisjunctionContext121 ParenthesesDisjunctionContext(int* output, ByteTerm& term)
122 : next(0)
123 {
124 unsigned firstSubpatternId = term.atom.subpatternId;
125 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
126
127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
128 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
129 output[(firstSubpatternId << 1) + i] = -1;
130 }
131
132 new(getDisjunctionContext(term)) DisjunctionContext();
133 }
134
operator newJSC::Yarr::Interpreter::ParenthesesDisjunctionContext135 void* operator new(size_t, void* where)
136 {
137 return where;
138 }
139
restoreOutputJSC::Yarr::Interpreter::ParenthesesDisjunctionContext140 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
141 {
142 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
143 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
144 }
145
getDisjunctionContextJSC::Yarr::Interpreter::ParenthesesDisjunctionContext146 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
147 {
148 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
149 }
150
151 ParenthesesDisjunctionContext* next;
152 int subpatternBackup[1];
153 };
154
allocParenthesesDisjunctionContext(ByteDisjunction * disjunction,int * output,ByteTerm & term)155 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
156 {
157 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
158 allocatorPool = allocatorPool->ensureCapacity(size);
159 if (!allocatorPool)
160 CRASH();
161 return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
162 }
163
freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext * context)164 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
165 {
166 allocatorPool = allocatorPool->dealloc(context);
167 }
168
169 class InputStream {
170 public:
InputStream(const UChar * input,unsigned start,unsigned length)171 InputStream(const UChar* input, unsigned start, unsigned length)
172 : input(input)
173 , pos(start)
174 , length(length)
175 {
176 }
177
next()178 void next()
179 {
180 ++pos;
181 }
182
rewind(unsigned amount)183 void rewind(unsigned amount)
184 {
185 ASSERT(pos >= amount);
186 pos -= amount;
187 }
188
read()189 int read()
190 {
191 ASSERT(pos < length);
192 if (pos < length)
193 return input[pos];
194 return -1;
195 }
196
readPair()197 int readPair()
198 {
199 ASSERT(pos + 1 < length);
200 return input[pos] | input[pos + 1] << 16;
201 }
202
readChecked(int position)203 int readChecked(int position)
204 {
205 ASSERT(position < 0);
206 ASSERT(static_cast<unsigned>(-position) <= pos);
207 unsigned p = pos + position;
208 ASSERT(p < length);
209 return input[p];
210 }
211
reread(unsigned from)212 int reread(unsigned from)
213 {
214 ASSERT(from < length);
215 return input[from];
216 }
217
prev()218 int prev()
219 {
220 ASSERT(!(pos > length));
221 if (pos && length)
222 return input[pos - 1];
223 return -1;
224 }
225
getPos()226 unsigned getPos()
227 {
228 return pos;
229 }
230
setPos(unsigned p)231 void setPos(unsigned p)
232 {
233 pos = p;
234 }
235
atStart()236 bool atStart()
237 {
238 return pos == 0;
239 }
240
atEnd()241 bool atEnd()
242 {
243 return pos == length;
244 }
245
checkInput(int count)246 bool checkInput(int count)
247 {
248 if ((pos + count) <= length) {
249 pos += count;
250 return true;
251 }
252 return false;
253 }
254
uncheckInput(int count)255 void uncheckInput(int count)
256 {
257 pos -= count;
258 }
259
atStart(int position)260 bool atStart(int position)
261 {
262 return (pos + position) == 0;
263 }
264
atEnd(int position)265 bool atEnd(int position)
266 {
267 return (pos + position) == length;
268 }
269
isNotAvailableInput(int position)270 bool isNotAvailableInput(int position)
271 {
272 return (pos + position) > length;
273 }
274
275 private:
276 const UChar* input;
277 unsigned pos;
278 unsigned length;
279 };
280
testCharacterClass(CharacterClass * characterClass,int ch)281 bool testCharacterClass(CharacterClass* characterClass, int ch)
282 {
283 if (ch & 0xFF80) {
284 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
285 if (ch == characterClass->m_matchesUnicode[i])
286 return true;
287 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
288 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
289 return true;
290 } else {
291 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
292 if (ch == characterClass->m_matches[i])
293 return true;
294 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
295 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
296 return true;
297 }
298
299 return false;
300 }
301
checkCharacter(int testChar,int inputPosition)302 bool checkCharacter(int testChar, int inputPosition)
303 {
304 return testChar == input.readChecked(inputPosition);
305 }
306
checkCasedCharacter(int loChar,int hiChar,int inputPosition)307 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
308 {
309 int ch = input.readChecked(inputPosition);
310 return (loChar == ch) || (hiChar == ch);
311 }
312
checkCharacterClass(CharacterClass * characterClass,bool invert,int inputPosition)313 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
314 {
315 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
316 return invert ? !match : match;
317 }
318
tryConsumeBackReference(int matchBegin,int matchEnd,int inputOffset)319 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
320 {
321 int matchSize = matchEnd - matchBegin;
322
323 if (!input.checkInput(matchSize))
324 return false;
325
326 if (pattern->m_ignoreCase) {
327 for (int i = 0; i < matchSize; ++i) {
328 int ch = input.reread(matchBegin + i);
329
330 int lo = Unicode::toLower(ch);
331 int hi = Unicode::toUpper(ch);
332
333 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
334 input.uncheckInput(matchSize);
335 return false;
336 }
337 }
338 } else {
339 for (int i = 0; i < matchSize; ++i) {
340 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
341 input.uncheckInput(matchSize);
342 return false;
343 }
344 }
345 }
346
347 return true;
348 }
349
matchAssertionBOL(ByteTerm & term)350 bool matchAssertionBOL(ByteTerm& term)
351 {
352 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
353 }
354
matchAssertionEOL(ByteTerm & term)355 bool matchAssertionEOL(ByteTerm& term)
356 {
357 if (term.inputPosition)
358 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
359
360 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
361 }
362
matchAssertionWordBoundary(ByteTerm & term)363 bool matchAssertionWordBoundary(ByteTerm& term)
364 {
365 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
366 bool readIsWordchar;
367 if (term.inputPosition)
368 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
369 else
370 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
371
372 bool wordBoundary = prevIsWordchar != readIsWordchar;
373 return term.invert() ? !wordBoundary : wordBoundary;
374 }
375
backtrackPatternCharacter(ByteTerm & term,DisjunctionContext * context)376 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
377 {
378 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
379
380 switch (term.atom.quantityType) {
381 case QuantifierFixedCount:
382 break;
383
384 case QuantifierGreedy:
385 if (backTrack->matchAmount) {
386 --backTrack->matchAmount;
387 input.uncheckInput(1);
388 return true;
389 }
390 break;
391
392 case QuantifierNonGreedy:
393 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
394 ++backTrack->matchAmount;
395 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
396 return true;
397 }
398 input.uncheckInput(backTrack->matchAmount);
399 break;
400 }
401
402 return false;
403 }
404
backtrackPatternCasedCharacter(ByteTerm & term,DisjunctionContext * context)405 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
406 {
407 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
408
409 switch (term.atom.quantityType) {
410 case QuantifierFixedCount:
411 break;
412
413 case QuantifierGreedy:
414 if (backTrack->matchAmount) {
415 --backTrack->matchAmount;
416 input.uncheckInput(1);
417 return true;
418 }
419 break;
420
421 case QuantifierNonGreedy:
422 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
423 ++backTrack->matchAmount;
424 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
425 return true;
426 }
427 input.uncheckInput(backTrack->matchAmount);
428 break;
429 }
430
431 return false;
432 }
433
matchCharacterClass(ByteTerm & term,DisjunctionContext * context)434 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
435 {
436 ASSERT(term.type == ByteTerm::TypeCharacterClass);
437 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
438
439 switch (term.atom.quantityType) {
440 case QuantifierFixedCount: {
441 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
442 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
443 return false;
444 }
445 return true;
446 }
447
448 case QuantifierGreedy: {
449 unsigned matchAmount = 0;
450 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
451 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
452 input.uncheckInput(1);
453 break;
454 }
455 ++matchAmount;
456 }
457 backTrack->matchAmount = matchAmount;
458
459 return true;
460 }
461
462 case QuantifierNonGreedy:
463 backTrack->matchAmount = 0;
464 return true;
465 }
466
467 ASSERT_NOT_REACHED();
468 return false;
469 }
470
backtrackCharacterClass(ByteTerm & term,DisjunctionContext * context)471 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
472 {
473 ASSERT(term.type == ByteTerm::TypeCharacterClass);
474 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
475
476 switch (term.atom.quantityType) {
477 case QuantifierFixedCount:
478 break;
479
480 case QuantifierGreedy:
481 if (backTrack->matchAmount) {
482 --backTrack->matchAmount;
483 input.uncheckInput(1);
484 return true;
485 }
486 break;
487
488 case QuantifierNonGreedy:
489 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
490 ++backTrack->matchAmount;
491 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
492 return true;
493 }
494 input.uncheckInput(backTrack->matchAmount);
495 break;
496 }
497
498 return false;
499 }
500
matchBackReference(ByteTerm & term,DisjunctionContext * context)501 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
502 {
503 ASSERT(term.type == ByteTerm::TypeBackReference);
504 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
505
506 int matchBegin = output[(term.atom.subpatternId << 1)];
507 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
508
509 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
510 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
511 // Eg.: /(a\1)/
512 if (matchEnd == -1)
513 return true;
514
515 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
516
517 if (matchBegin == matchEnd)
518 return true;
519
520 switch (term.atom.quantityType) {
521 case QuantifierFixedCount: {
522 backTrack->begin = input.getPos();
523 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
524 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
525 input.setPos(backTrack->begin);
526 return false;
527 }
528 }
529 return true;
530 }
531
532 case QuantifierGreedy: {
533 unsigned matchAmount = 0;
534 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
535 ++matchAmount;
536 backTrack->matchAmount = matchAmount;
537 return true;
538 }
539
540 case QuantifierNonGreedy:
541 backTrack->begin = input.getPos();
542 backTrack->matchAmount = 0;
543 return true;
544 }
545
546 ASSERT_NOT_REACHED();
547 return false;
548 }
549
backtrackBackReference(ByteTerm & term,DisjunctionContext * context)550 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
551 {
552 ASSERT(term.type == ByteTerm::TypeBackReference);
553 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
554
555 int matchBegin = output[(term.atom.subpatternId << 1)];
556 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
557 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
558
559 if (matchBegin == matchEnd)
560 return false;
561
562 switch (term.atom.quantityType) {
563 case QuantifierFixedCount:
564 // for quantityCount == 1, could rewind.
565 input.setPos(backTrack->begin);
566 break;
567
568 case QuantifierGreedy:
569 if (backTrack->matchAmount) {
570 --backTrack->matchAmount;
571 input.rewind(matchEnd - matchBegin);
572 return true;
573 }
574 break;
575
576 case QuantifierNonGreedy:
577 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
578 ++backTrack->matchAmount;
579 return true;
580 }
581 input.setPos(backTrack->begin);
582 break;
583 }
584
585 return false;
586 }
587
recordParenthesesMatch(ByteTerm & term,ParenthesesDisjunctionContext * context)588 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
589 {
590 if (term.capture()) {
591 unsigned subpatternId = term.atom.subpatternId;
592 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
593 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
594 }
595 }
resetMatches(ByteTerm & term,ParenthesesDisjunctionContext * context)596 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
597 {
598 unsigned firstSubpatternId = term.atom.subpatternId;
599 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
600 context->restoreOutput(output, firstSubpatternId, count);
601 }
parenthesesDoBacktrack(ByteTerm & term,BackTrackInfoParentheses * backTrack)602 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603 {
604 while (backTrack->matchAmount) {
605 ParenthesesDisjunctionContext* context = backTrack->lastContext;
606
607 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
608 if (result == JSRegExpMatch)
609 return JSRegExpMatch;
610
611 resetMatches(term, context);
612 popParenthesesDisjunctionContext(backTrack);
613 freeParenthesesDisjunctionContext(context);
614
615 if (result != JSRegExpNoMatch)
616 return result;
617 }
618
619 return JSRegExpNoMatch;
620 }
621
matchParenthesesOnceBegin(ByteTerm & term,DisjunctionContext * context)622 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
623 {
624 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
625 ASSERT(term.atom.quantityCount == 1);
626
627 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
628
629 switch (term.atom.quantityType) {
630 case QuantifierGreedy: {
631 // set this speculatively; if we get to the parens end this will be true.
632 backTrack->begin = input.getPos();
633 break;
634 }
635 case QuantifierNonGreedy: {
636 backTrack->begin = notFound;
637 context->term += term.atom.parenthesesWidth;
638 return true;
639 }
640 case QuantifierFixedCount:
641 break;
642 }
643
644 if (term.capture()) {
645 unsigned subpatternId = term.atom.subpatternId;
646 output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
647 }
648
649 return true;
650 }
651
matchParenthesesOnceEnd(ByteTerm & term,DisjunctionContext * context)652 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
653 {
654 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
655 ASSERT(term.atom.quantityCount == 1);
656
657 if (term.capture()) {
658 unsigned subpatternId = term.atom.subpatternId;
659 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
660 }
661
662 if (term.atom.quantityType == QuantifierFixedCount)
663 return true;
664
665 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666 return backTrack->begin != input.getPos();
667 }
668
backtrackParenthesesOnceBegin(ByteTerm & term,DisjunctionContext * context)669 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
670 {
671 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
672 ASSERT(term.atom.quantityCount == 1);
673
674 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
675
676 if (term.capture()) {
677 unsigned subpatternId = term.atom.subpatternId;
678 output[(subpatternId << 1)] = -1;
679 output[(subpatternId << 1) + 1] = -1;
680 }
681
682 switch (term.atom.quantityType) {
683 case QuantifierGreedy:
684 // if we backtrack to this point, there is another chance - try matching nothing.
685 ASSERT(backTrack->begin != notFound);
686 backTrack->begin = notFound;
687 context->term += term.atom.parenthesesWidth;
688 return true;
689 case QuantifierNonGreedy:
690 ASSERT(backTrack->begin != notFound);
691 case QuantifierFixedCount:
692 break;
693 }
694
695 return false;
696 }
697
backtrackParenthesesOnceEnd(ByteTerm & term,DisjunctionContext * context)698 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
699 {
700 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
701 ASSERT(term.atom.quantityCount == 1);
702
703 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
704
705 switch (term.atom.quantityType) {
706 case QuantifierGreedy:
707 if (backTrack->begin == notFound) {
708 context->term -= term.atom.parenthesesWidth;
709 return false;
710 }
711 case QuantifierNonGreedy:
712 if (backTrack->begin == notFound) {
713 backTrack->begin = input.getPos();
714 if (term.capture()) {
715 // Technically this access to inputPosition should be accessing the begin term's
716 // inputPosition, but for repeats other than fixed these values should be
717 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
718 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
719 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
720 unsigned subpatternId = term.atom.subpatternId;
721 output[subpatternId << 1] = input.getPos() + term.inputPosition;
722 }
723 context->term -= term.atom.parenthesesWidth;
724 return true;
725 }
726 case QuantifierFixedCount:
727 break;
728 }
729
730 return false;
731 }
732
matchParenthesesTerminalBegin(ByteTerm & term,DisjunctionContext * context)733 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
734 {
735 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
736 ASSERT(term.atom.quantityType == QuantifierGreedy);
737 ASSERT(term.atom.quantityCount == quantifyInfinite);
738 ASSERT(!term.capture());
739
740 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
741 backTrack->begin = input.getPos();
742 return true;
743 }
744
matchParenthesesTerminalEnd(ByteTerm & term,DisjunctionContext * context)745 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
746 {
747 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
748
749 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
750 // Empty match is a failed match.
751 if (backTrack->begin == input.getPos())
752 return false;
753
754 // Successful match! Okay, what's next? - loop around and try to match moar!
755 context->term -= (term.atom.parenthesesWidth + 1);
756 return true;
757 }
758
backtrackParenthesesTerminalBegin(ByteTerm & term,DisjunctionContext * context)759 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
760 {
761 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
762 ASSERT(term.atom.quantityType == QuantifierGreedy);
763 ASSERT(term.atom.quantityCount == quantifyInfinite);
764 ASSERT(!term.capture());
765
766 // If we backtrack to this point, we have failed to match this iteration of the parens.
767 // Since this is greedy / zero minimum a failed is also accepted as a match!
768 context->term += term.atom.parenthesesWidth;
769 return true;
770 }
771
backtrackParenthesesTerminalEnd(ByteTerm &,DisjunctionContext *)772 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
773 {
774 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
775 // should always be returned as a successful match - we should never backtrack to here.
776 ASSERT_NOT_REACHED();
777 return false;
778 }
779
matchParentheticalAssertionBegin(ByteTerm & term,DisjunctionContext * context)780 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
781 {
782 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
783 ASSERT(term.atom.quantityCount == 1);
784
785 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
786
787 backTrack->begin = input.getPos();
788 return true;
789 }
790
matchParentheticalAssertionEnd(ByteTerm & term,DisjunctionContext * context)791 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
792 {
793 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
794 ASSERT(term.atom.quantityCount == 1);
795
796 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
797
798 input.setPos(backTrack->begin);
799
800 // We've reached the end of the parens; if they are inverted, this is failure.
801 if (term.invert()) {
802 context->term -= term.atom.parenthesesWidth;
803 return false;
804 }
805
806 return true;
807 }
808
backtrackParentheticalAssertionBegin(ByteTerm & term,DisjunctionContext * context)809 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
810 {
811 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
812 ASSERT(term.atom.quantityCount == 1);
813
814 // We've failed to match parens; if they are inverted, this is win!
815 if (term.invert()) {
816 context->term += term.atom.parenthesesWidth;
817 return true;
818 }
819
820 return false;
821 }
822
backtrackParentheticalAssertionEnd(ByteTerm & term,DisjunctionContext * context)823 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
824 {
825 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
826 ASSERT(term.atom.quantityCount == 1);
827
828 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
829
830 input.setPos(backTrack->begin);
831
832 context->term -= term.atom.parenthesesWidth;
833 return false;
834 }
835
matchParentheses(ByteTerm & term,DisjunctionContext * context)836 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
837 {
838 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
839
840 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
841 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
842
843 backTrack->matchAmount = 0;
844 backTrack->lastContext = 0;
845
846 switch (term.atom.quantityType) {
847 case QuantifierFixedCount: {
848 // While we haven't yet reached our fixed limit,
849 while (backTrack->matchAmount < term.atom.quantityCount) {
850 // Try to do a match, and it it succeeds, add it to the list.
851 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
852 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
853 if (result == JSRegExpMatch)
854 appendParenthesesDisjunctionContext(backTrack, context);
855 else {
856 // The match failed; try to find an alternate point to carry on from.
857 resetMatches(term, context);
858 freeParenthesesDisjunctionContext(context);
859
860 if (result == JSRegExpNoMatch) {
861 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
862 if (backtrackResult != JSRegExpMatch)
863 return backtrackResult;
864 } else
865 return result;
866 }
867 }
868
869 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
870 ParenthesesDisjunctionContext* context = backTrack->lastContext;
871 recordParenthesesMatch(term, context);
872 return JSRegExpMatch;
873 }
874
875 case QuantifierGreedy: {
876 while (backTrack->matchAmount < term.atom.quantityCount) {
877 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
878 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
879 if (result == JSRegExpMatch)
880 appendParenthesesDisjunctionContext(backTrack, context);
881 else {
882 resetMatches(term, context);
883 freeParenthesesDisjunctionContext(context);
884
885 if (result != JSRegExpNoMatch)
886 return result;
887
888 break;
889 }
890 }
891
892 if (backTrack->matchAmount) {
893 ParenthesesDisjunctionContext* context = backTrack->lastContext;
894 recordParenthesesMatch(term, context);
895 }
896 return JSRegExpMatch;
897 }
898
899 case QuantifierNonGreedy:
900 return JSRegExpMatch;
901 }
902
903 ASSERT_NOT_REACHED();
904 return JSRegExpErrorNoMatch;
905 }
906
907 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
908 //
909 // Greedy matches never should try just adding more - you should already have done
910 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
911 // you backtrack an item off the list needs checking, since we'll never have matched
912 // the one less case. Tracking forwards, still add as much as possible.
913 //
914 // Non-greedy, we've already done the one less case, so don't match on popping.
915 // We haven't done the one more case, so always try to add that.
916 //
backtrackParentheses(ByteTerm & term,DisjunctionContext * context)917 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
918 {
919 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
920
921 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
922 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
923
924 switch (term.atom.quantityType) {
925 case QuantifierFixedCount: {
926 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
927
928 ParenthesesDisjunctionContext* context = 0;
929 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
930
931 if (result != JSRegExpMatch)
932 return result;
933
934 // While we haven't yet reached our fixed limit,
935 while (backTrack->matchAmount < term.atom.quantityCount) {
936 // Try to do a match, and it it succeeds, add it to the list.
937 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
938 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
939
940 if (result == JSRegExpMatch)
941 appendParenthesesDisjunctionContext(backTrack, context);
942 else {
943 // The match failed; try to find an alternate point to carry on from.
944 resetMatches(term, context);
945 freeParenthesesDisjunctionContext(context);
946
947 if (result == JSRegExpNoMatch) {
948 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
949 if (backtrackResult != JSRegExpMatch)
950 return backtrackResult;
951 } else
952 return result;
953 }
954 }
955
956 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
957 context = backTrack->lastContext;
958 recordParenthesesMatch(term, context);
959 return JSRegExpMatch;
960 }
961
962 case QuantifierGreedy: {
963 if (!backTrack->matchAmount)
964 return JSRegExpNoMatch;
965
966 ParenthesesDisjunctionContext* context = backTrack->lastContext;
967 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
968 if (result == JSRegExpMatch) {
969 while (backTrack->matchAmount < term.atom.quantityCount) {
970 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
971 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
972 if (parenthesesResult == JSRegExpMatch)
973 appendParenthesesDisjunctionContext(backTrack, context);
974 else {
975 resetMatches(term, context);
976 freeParenthesesDisjunctionContext(context);
977
978 if (parenthesesResult != JSRegExpNoMatch)
979 return parenthesesResult;
980
981 break;
982 }
983 }
984 } else {
985 resetMatches(term, context);
986 popParenthesesDisjunctionContext(backTrack);
987 freeParenthesesDisjunctionContext(context);
988
989 if (result != JSRegExpNoMatch)
990 return result;
991 }
992
993 if (backTrack->matchAmount) {
994 ParenthesesDisjunctionContext* context = backTrack->lastContext;
995 recordParenthesesMatch(term, context);
996 }
997 return JSRegExpMatch;
998 }
999
1000 case QuantifierNonGreedy: {
1001 // If we've not reached the limit, try to add one more match.
1002 if (backTrack->matchAmount < term.atom.quantityCount) {
1003 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1004 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1005 if (result == JSRegExpMatch) {
1006 appendParenthesesDisjunctionContext(backTrack, context);
1007 recordParenthesesMatch(term, context);
1008 return JSRegExpMatch;
1009 }
1010
1011 resetMatches(term, context);
1012 freeParenthesesDisjunctionContext(context);
1013
1014 if (result != JSRegExpNoMatch)
1015 return result;
1016 }
1017
1018 // Nope - okay backtrack looking for an alternative.
1019 while (backTrack->matchAmount) {
1020 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1021 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1022 if (result == JSRegExpMatch) {
1023 // successful backtrack! we're back in the game!
1024 if (backTrack->matchAmount) {
1025 context = backTrack->lastContext;
1026 recordParenthesesMatch(term, context);
1027 }
1028 return JSRegExpMatch;
1029 }
1030
1031 // pop a match off the stack
1032 resetMatches(term, context);
1033 popParenthesesDisjunctionContext(backTrack);
1034 freeParenthesesDisjunctionContext(context);
1035
1036 return result;
1037 }
1038
1039 return JSRegExpNoMatch;
1040 }
1041 }
1042
1043 ASSERT_NOT_REACHED();
1044 return JSRegExpErrorNoMatch;
1045 }
1046
lookupForBeginChars()1047 void lookupForBeginChars()
1048 {
1049 int character;
1050 bool firstSingleCharFound;
1051
1052 while (true) {
1053 if (input.isNotAvailableInput(2))
1054 return;
1055
1056 firstSingleCharFound = false;
1057
1058 character = input.readPair();
1059
1060 for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
1061 BeginChar bc = pattern->m_beginChars[i];
1062
1063 if (!firstSingleCharFound && bc.value <= 0xFFFF) {
1064 firstSingleCharFound = true;
1065 character &= 0xFFFF;
1066 }
1067
1068 if ((character | bc.mask) == bc.value)
1069 return;
1070 }
1071
1072 input.next();
1073 }
1074 }
1075
1076 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1077 #define BACKTRACK() { --context->term; goto backtrack; }
1078 #define currentTerm() (disjunction->terms[context->term])
matchDisjunction(ByteDisjunction * disjunction,DisjunctionContext * context,bool btrack=false,bool isBody=false)1079 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
1080 {
1081 if (!--remainingMatchCount)
1082 return JSRegExpErrorHitLimit;
1083
1084 if (btrack)
1085 BACKTRACK();
1086
1087 if (pattern->m_containsBeginChars && isBody)
1088 lookupForBeginChars();
1089
1090 context->matchBegin = input.getPos();
1091 context->term = 0;
1092
1093 matchAgain:
1094 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1095
1096 switch (currentTerm().type) {
1097 case ByteTerm::TypeSubpatternBegin:
1098 MATCH_NEXT();
1099 case ByteTerm::TypeSubpatternEnd:
1100 context->matchEnd = input.getPos();
1101 return JSRegExpMatch;
1102
1103 case ByteTerm::TypeBodyAlternativeBegin:
1104 MATCH_NEXT();
1105 case ByteTerm::TypeBodyAlternativeDisjunction:
1106 case ByteTerm::TypeBodyAlternativeEnd:
1107 context->matchEnd = input.getPos();
1108 return JSRegExpMatch;
1109
1110 case ByteTerm::TypeAlternativeBegin:
1111 MATCH_NEXT();
1112 case ByteTerm::TypeAlternativeDisjunction:
1113 case ByteTerm::TypeAlternativeEnd: {
1114 int offset = currentTerm().alternative.end;
1115 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1116 backTrack->offset = offset;
1117 context->term += offset;
1118 MATCH_NEXT();
1119 }
1120
1121 case ByteTerm::TypeAssertionBOL:
1122 if (matchAssertionBOL(currentTerm()))
1123 MATCH_NEXT();
1124 BACKTRACK();
1125 case ByteTerm::TypeAssertionEOL:
1126 if (matchAssertionEOL(currentTerm()))
1127 MATCH_NEXT();
1128 BACKTRACK();
1129 case ByteTerm::TypeAssertionWordBoundary:
1130 if (matchAssertionWordBoundary(currentTerm()))
1131 MATCH_NEXT();
1132 BACKTRACK();
1133
1134 case ByteTerm::TypePatternCharacterOnce:
1135 case ByteTerm::TypePatternCharacterFixed: {
1136 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1137 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1138 BACKTRACK();
1139 }
1140 MATCH_NEXT();
1141 }
1142 case ByteTerm::TypePatternCharacterGreedy: {
1143 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1144 unsigned matchAmount = 0;
1145 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1146 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1147 input.uncheckInput(1);
1148 break;
1149 }
1150 ++matchAmount;
1151 }
1152 backTrack->matchAmount = matchAmount;
1153
1154 MATCH_NEXT();
1155 }
1156 case ByteTerm::TypePatternCharacterNonGreedy: {
1157 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1158 backTrack->matchAmount = 0;
1159 MATCH_NEXT();
1160 }
1161
1162 case ByteTerm::TypePatternCasedCharacterOnce:
1163 case ByteTerm::TypePatternCasedCharacterFixed: {
1164 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1165 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1166 BACKTRACK();
1167 }
1168 MATCH_NEXT();
1169 }
1170 case ByteTerm::TypePatternCasedCharacterGreedy: {
1171 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1172 unsigned matchAmount = 0;
1173 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1174 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1175 input.uncheckInput(1);
1176 break;
1177 }
1178 ++matchAmount;
1179 }
1180 backTrack->matchAmount = matchAmount;
1181
1182 MATCH_NEXT();
1183 }
1184 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1185 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1186 backTrack->matchAmount = 0;
1187 MATCH_NEXT();
1188 }
1189
1190 case ByteTerm::TypeCharacterClass:
1191 if (matchCharacterClass(currentTerm(), context))
1192 MATCH_NEXT();
1193 BACKTRACK();
1194 case ByteTerm::TypeBackReference:
1195 if (matchBackReference(currentTerm(), context))
1196 MATCH_NEXT();
1197 BACKTRACK();
1198 case ByteTerm::TypeParenthesesSubpattern: {
1199 JSRegExpResult result = matchParentheses(currentTerm(), context);
1200
1201 if (result == JSRegExpMatch) {
1202 MATCH_NEXT();
1203 } else if (result != JSRegExpNoMatch)
1204 return result;
1205
1206 BACKTRACK();
1207 }
1208 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1209 if (matchParenthesesOnceBegin(currentTerm(), context))
1210 MATCH_NEXT();
1211 BACKTRACK();
1212 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1213 if (matchParenthesesOnceEnd(currentTerm(), context))
1214 MATCH_NEXT();
1215 BACKTRACK();
1216 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1217 if (matchParenthesesTerminalBegin(currentTerm(), context))
1218 MATCH_NEXT();
1219 BACKTRACK();
1220 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1221 if (matchParenthesesTerminalEnd(currentTerm(), context))
1222 MATCH_NEXT();
1223 BACKTRACK();
1224 case ByteTerm::TypeParentheticalAssertionBegin:
1225 if (matchParentheticalAssertionBegin(currentTerm(), context))
1226 MATCH_NEXT();
1227 BACKTRACK();
1228 case ByteTerm::TypeParentheticalAssertionEnd:
1229 if (matchParentheticalAssertionEnd(currentTerm(), context))
1230 MATCH_NEXT();
1231 BACKTRACK();
1232
1233 case ByteTerm::TypeCheckInput:
1234 if (input.checkInput(currentTerm().checkInputCount))
1235 MATCH_NEXT();
1236 BACKTRACK();
1237
1238 case ByteTerm::TypeUncheckInput:
1239 input.uncheckInput(currentTerm().checkInputCount);
1240 MATCH_NEXT();
1241 }
1242
1243 // We should never fall-through to here.
1244 ASSERT_NOT_REACHED();
1245
1246 backtrack:
1247 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1248
1249 switch (currentTerm().type) {
1250 case ByteTerm::TypeSubpatternBegin:
1251 return JSRegExpNoMatch;
1252 case ByteTerm::TypeSubpatternEnd:
1253 ASSERT_NOT_REACHED();
1254
1255 case ByteTerm::TypeBodyAlternativeBegin:
1256 case ByteTerm::TypeBodyAlternativeDisjunction: {
1257 int offset = currentTerm().alternative.next;
1258 context->term += offset;
1259 if (offset > 0)
1260 MATCH_NEXT();
1261
1262 if (input.atEnd())
1263 return JSRegExpNoMatch;
1264
1265 input.next();
1266
1267 if (pattern->m_containsBeginChars && isBody)
1268 lookupForBeginChars();
1269
1270 context->matchBegin = input.getPos();
1271
1272 if (currentTerm().alternative.onceThrough)
1273 context->term += currentTerm().alternative.next;
1274
1275 MATCH_NEXT();
1276 }
1277 case ByteTerm::TypeBodyAlternativeEnd:
1278 ASSERT_NOT_REACHED();
1279
1280 case ByteTerm::TypeAlternativeBegin:
1281 case ByteTerm::TypeAlternativeDisjunction: {
1282 int offset = currentTerm().alternative.next;
1283 context->term += offset;
1284 if (offset > 0)
1285 MATCH_NEXT();
1286 BACKTRACK();
1287 }
1288 case ByteTerm::TypeAlternativeEnd: {
1289 // We should never backtrack back into an alternative of the main body of the regex.
1290 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1291 unsigned offset = backTrack->offset;
1292 context->term -= offset;
1293 BACKTRACK();
1294 }
1295
1296 case ByteTerm::TypeAssertionBOL:
1297 case ByteTerm::TypeAssertionEOL:
1298 case ByteTerm::TypeAssertionWordBoundary:
1299 BACKTRACK();
1300
1301 case ByteTerm::TypePatternCharacterOnce:
1302 case ByteTerm::TypePatternCharacterFixed:
1303 case ByteTerm::TypePatternCharacterGreedy:
1304 case ByteTerm::TypePatternCharacterNonGreedy:
1305 if (backtrackPatternCharacter(currentTerm(), context))
1306 MATCH_NEXT();
1307 BACKTRACK();
1308 case ByteTerm::TypePatternCasedCharacterOnce:
1309 case ByteTerm::TypePatternCasedCharacterFixed:
1310 case ByteTerm::TypePatternCasedCharacterGreedy:
1311 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1312 if (backtrackPatternCasedCharacter(currentTerm(), context))
1313 MATCH_NEXT();
1314 BACKTRACK();
1315 case ByteTerm::TypeCharacterClass:
1316 if (backtrackCharacterClass(currentTerm(), context))
1317 MATCH_NEXT();
1318 BACKTRACK();
1319 case ByteTerm::TypeBackReference:
1320 if (backtrackBackReference(currentTerm(), context))
1321 MATCH_NEXT();
1322 BACKTRACK();
1323 case ByteTerm::TypeParenthesesSubpattern: {
1324 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1325
1326 if (result == JSRegExpMatch) {
1327 MATCH_NEXT();
1328 } else if (result != JSRegExpNoMatch)
1329 return result;
1330
1331 BACKTRACK();
1332 }
1333 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1334 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1335 MATCH_NEXT();
1336 BACKTRACK();
1337 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1338 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1339 MATCH_NEXT();
1340 BACKTRACK();
1341 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1342 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1343 MATCH_NEXT();
1344 BACKTRACK();
1345 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1346 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1347 MATCH_NEXT();
1348 BACKTRACK();
1349 case ByteTerm::TypeParentheticalAssertionBegin:
1350 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1351 MATCH_NEXT();
1352 BACKTRACK();
1353 case ByteTerm::TypeParentheticalAssertionEnd:
1354 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1355 MATCH_NEXT();
1356 BACKTRACK();
1357
1358 case ByteTerm::TypeCheckInput:
1359 input.uncheckInput(currentTerm().checkInputCount);
1360 BACKTRACK();
1361
1362 case ByteTerm::TypeUncheckInput:
1363 input.checkInput(currentTerm().checkInputCount);
1364 BACKTRACK();
1365 }
1366
1367 ASSERT_NOT_REACHED();
1368 return JSRegExpErrorNoMatch;
1369 }
1370
matchNonZeroDisjunction(ByteDisjunction * disjunction,DisjunctionContext * context,bool btrack=false)1371 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1372 {
1373 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1374
1375 if (result == JSRegExpMatch) {
1376 while (context->matchBegin == context->matchEnd) {
1377 result = matchDisjunction(disjunction, context, true);
1378 if (result != JSRegExpMatch)
1379 return result;
1380 }
1381 return JSRegExpMatch;
1382 }
1383
1384 return result;
1385 }
1386
interpret()1387 int interpret()
1388 {
1389 allocatorPool = pattern->m_allocator->startAllocator();
1390 if (!allocatorPool)
1391 CRASH();
1392
1393 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1394 output[i] = -1;
1395
1396 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1397
1398 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
1399 if (result == JSRegExpMatch) {
1400 output[0] = context->matchBegin;
1401 output[1] = context->matchEnd;
1402 }
1403
1404 freeDisjunctionContext(context);
1405
1406 pattern->m_allocator->stopAllocator();
1407
1408 // RegExp.cpp currently expects all error to be converted to -1.
1409 ASSERT((result == JSRegExpMatch) == (output[0] != -1));
1410 return output[0];
1411 }
1412
Interpreter(BytecodePattern * pattern,int * output,const UChar * inputChar,unsigned start,unsigned length)1413 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1414 : pattern(pattern)
1415 , output(output)
1416 , input(inputChar, start, length)
1417 , allocatorPool(0)
1418 , remainingMatchCount(matchLimit)
1419 {
1420 }
1421
1422 private:
1423 BytecodePattern* pattern;
1424 int* output;
1425 InputStream input;
1426 BumpPointerPool* allocatorPool;
1427 unsigned remainingMatchCount;
1428 };
1429
1430
1431
1432 class ByteCompiler {
1433 struct ParenthesesStackEntry {
1434 unsigned beginTerm;
1435 unsigned savedAlternativeIndex;
ParenthesesStackEntryJSC::Yarr::ByteCompiler::ParenthesesStackEntry1436 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1437 : beginTerm(beginTerm)
1438 , savedAlternativeIndex(savedAlternativeIndex)
1439 {
1440 }
1441 };
1442
1443 public:
ByteCompiler(YarrPattern & pattern)1444 ByteCompiler(YarrPattern& pattern)
1445 : m_pattern(pattern)
1446 {
1447 m_currentAlternativeIndex = 0;
1448 }
1449
compile(BumpPointerAllocator * allocator)1450 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1451 {
1452 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1453 emitDisjunction(m_pattern.m_body);
1454 regexEnd();
1455
1456 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1457 }
1458
checkInput(unsigned count)1459 void checkInput(unsigned count)
1460 {
1461 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1462 }
1463
uncheckInput(unsigned count)1464 void uncheckInput(unsigned count)
1465 {
1466 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1467 }
1468
assertionBOL(int inputPosition)1469 void assertionBOL(int inputPosition)
1470 {
1471 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1472 }
1473
assertionEOL(int inputPosition)1474 void assertionEOL(int inputPosition)
1475 {
1476 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1477 }
1478
assertionWordBoundary(bool invert,int inputPosition)1479 void assertionWordBoundary(bool invert, int inputPosition)
1480 {
1481 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1482 }
1483
atomPatternCharacter(UChar ch,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1484 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1485 {
1486 if (m_pattern.m_ignoreCase) {
1487 UChar lo = Unicode::toLower(ch);
1488 UChar hi = Unicode::toUpper(ch);
1489
1490 if (lo != hi) {
1491 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1492 return;
1493 }
1494 }
1495
1496 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1497 }
1498
atomCharacterClass(CharacterClass * characterClass,bool invert,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1499 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1500 {
1501 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1502
1503 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1504 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1505 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1506 }
1507
atomBackReference(unsigned subpatternId,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1508 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1509 {
1510 ASSERT(subpatternId);
1511
1512 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1513
1514 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1515 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1516 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1517 }
1518
atomParenthesesOnceBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1519 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1520 {
1521 int beginTerm = m_bodyDisjunction->terms.size();
1522
1523 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1524 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1525 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1526 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1527
1528 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1529 m_currentAlternativeIndex = beginTerm + 1;
1530 }
1531
atomParenthesesTerminalBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1532 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1533 {
1534 int beginTerm = m_bodyDisjunction->terms.size();
1535
1536 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1537 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1538 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1539 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1540
1541 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1542 m_currentAlternativeIndex = beginTerm + 1;
1543 }
1544
atomParenthesesSubpatternBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1545 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1546 {
1547 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1548 // then fix this up at the end! - simplifying this should make it much clearer.
1549 // https://bugs.webkit.org/show_bug.cgi?id=50136
1550
1551 int beginTerm = m_bodyDisjunction->terms.size();
1552
1553 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1554 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1555 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1556 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1557
1558 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1559 m_currentAlternativeIndex = beginTerm + 1;
1560 }
1561
atomParentheticalAssertionBegin(unsigned subpatternId,bool invert,unsigned frameLocation,unsigned alternativeFrameLocation)1562 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1563 {
1564 int beginTerm = m_bodyDisjunction->terms.size();
1565
1566 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1567 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1568 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1569 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1570
1571 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1572 m_currentAlternativeIndex = beginTerm + 1;
1573 }
1574
atomParentheticalAssertionEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1575 void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1576 {
1577 unsigned beginTerm = popParenthesesStack();
1578 closeAlternative(beginTerm + 1);
1579 unsigned endTerm = m_bodyDisjunction->terms.size();
1580
1581 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1582
1583 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1584 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1585
1586 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1587 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1588 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1589 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1590
1591 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1592 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1593 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1594 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1595 }
1596
popParenthesesStack()1597 unsigned popParenthesesStack()
1598 {
1599 ASSERT(m_parenthesesStack.size());
1600 int stackEnd = m_parenthesesStack.size() - 1;
1601 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1602 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1603 m_parenthesesStack.shrink(stackEnd);
1604
1605 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1606 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1607
1608 return beginTerm;
1609 }
1610
1611 #ifndef NDEBUG
dumpDisjunction(ByteDisjunction * disjunction)1612 void dumpDisjunction(ByteDisjunction* disjunction)
1613 {
1614 printf("ByteDisjunction(%p):\n\t", disjunction);
1615 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1616 printf("{ %d } ", disjunction->terms[i].type);
1617 printf("\n");
1618 }
1619 #endif
1620
closeAlternative(int beginTerm)1621 void closeAlternative(int beginTerm)
1622 {
1623 int origBeginTerm = beginTerm;
1624 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1625 int endIndex = m_bodyDisjunction->terms.size();
1626
1627 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1628
1629 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1630 m_bodyDisjunction->terms.remove(beginTerm);
1631 else {
1632 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1633 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1634 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1635 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1636 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1637 }
1638
1639 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1640
1641 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1642 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1643 }
1644 }
1645
closeBodyAlternative()1646 void closeBodyAlternative()
1647 {
1648 int beginTerm = 0;
1649 int origBeginTerm = 0;
1650 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1651 int endIndex = m_bodyDisjunction->terms.size();
1652
1653 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1654
1655 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1656 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1657 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1658 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1659 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1660 }
1661
1662 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1663
1664 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1665 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1666 }
1667
atomParenthesesSubpatternEnd(unsigned lastSubpatternId,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType,unsigned callFrameSize=0)1668 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1669 {
1670 unsigned beginTerm = popParenthesesStack();
1671 closeAlternative(beginTerm + 1);
1672 unsigned endTerm = m_bodyDisjunction->terms.size();
1673
1674 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1675
1676 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1677
1678 bool capture = parenthesesBegin.capture();
1679 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1680
1681 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1682 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1683
1684 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1685 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1686 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1687 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1688
1689 m_bodyDisjunction->terms.shrink(beginTerm);
1690
1691 m_allParenthesesInfo.append(parenthesesDisjunction);
1692 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
1693
1694 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1695 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1696 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1697 }
1698
atomParenthesesOnceEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1699 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1700 {
1701 unsigned beginTerm = popParenthesesStack();
1702 closeAlternative(beginTerm + 1);
1703 unsigned endTerm = m_bodyDisjunction->terms.size();
1704
1705 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1706
1707 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1708 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1709
1710 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1711 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1712 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1713 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1714
1715 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1716 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1717 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1718 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1719 }
1720
atomParenthesesTerminalEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1721 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1722 {
1723 unsigned beginTerm = popParenthesesStack();
1724 closeAlternative(beginTerm + 1);
1725 unsigned endTerm = m_bodyDisjunction->terms.size();
1726
1727 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1728
1729 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1730 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1731
1732 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1733 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1734 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1735 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1736
1737 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1738 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1739 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1740 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1741 }
1742
regexBegin(unsigned numSubpatterns,unsigned callFrameSize,bool onceThrough)1743 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1744 {
1745 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1746 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1747 m_bodyDisjunction->terms[0].frameLocation = 0;
1748 m_currentAlternativeIndex = 0;
1749 }
1750
regexEnd()1751 void regexEnd()
1752 {
1753 closeBodyAlternative();
1754 }
1755
alternativeBodyDisjunction(bool onceThrough)1756 void alternativeBodyDisjunction(bool onceThrough)
1757 {
1758 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1759 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1760 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1761
1762 m_currentAlternativeIndex = newAlternativeIndex;
1763 }
1764
alternativeDisjunction()1765 void alternativeDisjunction()
1766 {
1767 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1768 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1769 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1770
1771 m_currentAlternativeIndex = newAlternativeIndex;
1772 }
1773
emitDisjunction(PatternDisjunction * disjunction,unsigned inputCountAlreadyChecked=0,unsigned parenthesesInputCountAlreadyChecked=0,bool isParentheticalAssertion=false)1774 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
1775 {
1776 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1777 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1778
1779 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1780
1781 if (alt) {
1782 if (disjunction == m_pattern.m_body)
1783 alternativeBodyDisjunction(alternative->onceThrough());
1784 else
1785 alternativeDisjunction();
1786 }
1787
1788 unsigned minimumSize = alternative->m_minimumSize;
1789 int countToCheck;
1790
1791 if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
1792 countToCheck = 0;
1793 else
1794 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1795
1796 ASSERT(countToCheck >= 0);
1797 if (countToCheck) {
1798 checkInput(countToCheck);
1799 currentCountAlreadyChecked += countToCheck;
1800 }
1801
1802 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1803 PatternTerm& term = alternative->m_terms[i];
1804
1805 switch (term.type) {
1806 case PatternTerm::TypeAssertionBOL:
1807 assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1808 break;
1809
1810 case PatternTerm::TypeAssertionEOL:
1811 assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1812 break;
1813
1814 case PatternTerm::TypeAssertionWordBoundary:
1815 assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked);
1816 break;
1817
1818 case PatternTerm::TypePatternCharacter:
1819 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1820 break;
1821
1822 case PatternTerm::TypeCharacterClass:
1823 atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1824 break;
1825
1826 case PatternTerm::TypeBackReference:
1827 atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1828 break;
1829
1830 case PatternTerm::TypeForwardReference:
1831 break;
1832
1833 case PatternTerm::TypeParenthesesSubpattern: {
1834 unsigned disjunctionAlreadyCheckedCount = 0;
1835 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1836 unsigned alternativeFrameLocation = term.frameLocation;
1837 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1838 if (term.quantityType == QuantifierFixedCount)
1839 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1840 else
1841 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1842 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1843 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
1844 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1845 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1846 } else if (term.parentheses.isTerminal) {
1847 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1848 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1849 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1850 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1851 } else {
1852 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1853 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1854 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1855 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1856 }
1857 break;
1858 }
1859
1860 case PatternTerm::TypeParentheticalAssertion: {
1861 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1862
1863 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1864 int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
1865 int uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1866
1867 if (uncheckAmount > 0) {
1868 uncheckInput(uncheckAmount);
1869 currentCountAlreadyChecked -= uncheckAmount;
1870 } else
1871 uncheckAmount = 0;
1872
1873 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1874 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
1875 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1876 if (uncheckAmount) {
1877 checkInput(uncheckAmount);
1878 currentCountAlreadyChecked += uncheckAmount;
1879 }
1880 break;
1881 }
1882 }
1883 }
1884 }
1885 }
1886
1887 private:
1888 YarrPattern& m_pattern;
1889 OwnPtr<ByteDisjunction> m_bodyDisjunction;
1890 unsigned m_currentAlternativeIndex;
1891 Vector<ParenthesesStackEntry> m_parenthesesStack;
1892 Vector<ByteDisjunction*> m_allParenthesesInfo;
1893 };
1894
byteCompile(YarrPattern & pattern,BumpPointerAllocator * allocator)1895 PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1896 {
1897 return ByteCompiler(pattern).compile(allocator);
1898 }
1899
interpret(BytecodePattern * bytecode,const UChar * input,unsigned start,unsigned length,int * output)1900 int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output)
1901 {
1902 return Interpreter(bytecode, output, input, start, length).interpret();
1903 }
1904
1905 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1906 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1907 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1908 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1909 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1910 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1911 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
1912
1913
1914 } }
1915