• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "YarrInterpreter.h"
29 
30 #include "Yarr.h"
31 #include <wtf/BumpPointerAllocator.h>
32 
33 #ifndef NDEBUG
34 #include <stdio.h>
35 #endif
36 
37 using namespace WTF;
38 
39 namespace JSC { namespace Yarr {
40 
41 class Interpreter {
42 public:
43     struct ParenthesesDisjunctionContext;
44 
45     struct BackTrackInfoPatternCharacter {
46         uintptr_t matchAmount;
47     };
48     struct BackTrackInfoCharacterClass {
49         uintptr_t matchAmount;
50     };
51     struct BackTrackInfoBackReference {
52         uintptr_t begin; // Not really needed for greedy quantifiers.
53         uintptr_t matchAmount; // Not really needed for fixed quantifiers.
54     };
55     struct BackTrackInfoAlternative {
56         uintptr_t offset;
57     };
58     struct BackTrackInfoParentheticalAssertion {
59         uintptr_t begin;
60     };
61     struct BackTrackInfoParenthesesOnce {
62         uintptr_t begin;
63     };
64     struct BackTrackInfoParenthesesTerminal {
65         uintptr_t begin;
66     };
67     struct BackTrackInfoParentheses {
68         uintptr_t matchAmount;
69         ParenthesesDisjunctionContext* lastContext;
70     };
71 
appendParenthesesDisjunctionContext(BackTrackInfoParentheses * backTrack,ParenthesesDisjunctionContext * context)72     static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73     {
74         context->next = backTrack->lastContext;
75         backTrack->lastContext = context;
76         ++backTrack->matchAmount;
77     }
78 
popParenthesesDisjunctionContext(BackTrackInfoParentheses * backTrack)79     static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80     {
81         ASSERT(backTrack->matchAmount);
82         ASSERT(backTrack->lastContext);
83         backTrack->lastContext = backTrack->lastContext->next;
84         --backTrack->matchAmount;
85     }
86 
87     struct DisjunctionContext
88     {
DisjunctionContextJSC::Yarr::Interpreter::DisjunctionContext89         DisjunctionContext()
90             : term(0)
91         {
92         }
93 
operator newJSC::Yarr::Interpreter::DisjunctionContext94         void* operator new(size_t, void* where)
95         {
96             return where;
97         }
98 
99         int term;
100         unsigned matchBegin;
101         unsigned matchEnd;
102         uintptr_t frame[1];
103     };
104 
allocDisjunctionContext(ByteDisjunction * disjunction)105     DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106     {
107         size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
108         allocatorPool = allocatorPool->ensureCapacity(size);
109         if (!allocatorPool)
110             CRASH();
111         return new(allocatorPool->alloc(size)) DisjunctionContext();
112     }
113 
freeDisjunctionContext(DisjunctionContext * context)114     void freeDisjunctionContext(DisjunctionContext* context)
115     {
116         allocatorPool = allocatorPool->dealloc(context);
117     }
118 
119     struct ParenthesesDisjunctionContext
120     {
ParenthesesDisjunctionContextJSC::Yarr::Interpreter::ParenthesesDisjunctionContext121         ParenthesesDisjunctionContext(int* output, ByteTerm& term)
122             : next(0)
123         {
124             unsigned firstSubpatternId = term.atom.subpatternId;
125             unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
126 
127             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
128                 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
129                 output[(firstSubpatternId << 1) + i] = -1;
130             }
131 
132             new(getDisjunctionContext(term)) DisjunctionContext();
133         }
134 
operator newJSC::Yarr::Interpreter::ParenthesesDisjunctionContext135         void* operator new(size_t, void* where)
136         {
137             return where;
138         }
139 
restoreOutputJSC::Yarr::Interpreter::ParenthesesDisjunctionContext140         void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
141         {
142             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
143                 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
144         }
145 
getDisjunctionContextJSC::Yarr::Interpreter::ParenthesesDisjunctionContext146         DisjunctionContext* getDisjunctionContext(ByteTerm& term)
147         {
148             return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
149         }
150 
151         ParenthesesDisjunctionContext* next;
152         int subpatternBackup[1];
153     };
154 
allocParenthesesDisjunctionContext(ByteDisjunction * disjunction,int * output,ByteTerm & term)155     ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
156     {
157         size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
158         allocatorPool = allocatorPool->ensureCapacity(size);
159         if (!allocatorPool)
160             CRASH();
161         return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
162     }
163 
freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext * context)164     void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
165     {
166         allocatorPool = allocatorPool->dealloc(context);
167     }
168 
169     class InputStream {
170     public:
InputStream(const UChar * input,unsigned start,unsigned length)171         InputStream(const UChar* input, unsigned start, unsigned length)
172             : input(input)
173             , pos(start)
174             , length(length)
175         {
176         }
177 
next()178         void next()
179         {
180             ++pos;
181         }
182 
rewind(unsigned amount)183         void rewind(unsigned amount)
184         {
185             ASSERT(pos >= amount);
186             pos -= amount;
187         }
188 
read()189         int read()
190         {
191             ASSERT(pos < length);
192             if (pos < length)
193                 return input[pos];
194             return -1;
195         }
196 
readPair()197         int readPair()
198         {
199             ASSERT(pos + 1 < length);
200             return input[pos] | input[pos + 1] << 16;
201         }
202 
readChecked(int position)203         int readChecked(int position)
204         {
205             ASSERT(position < 0);
206             ASSERT(static_cast<unsigned>(-position) <= pos);
207             unsigned p = pos + position;
208             ASSERT(p < length);
209             return input[p];
210         }
211 
reread(unsigned from)212         int reread(unsigned from)
213         {
214             ASSERT(from < length);
215             return input[from];
216         }
217 
prev()218         int prev()
219         {
220             ASSERT(!(pos > length));
221             if (pos && length)
222                 return input[pos - 1];
223             return -1;
224         }
225 
getPos()226         unsigned getPos()
227         {
228             return pos;
229         }
230 
setPos(unsigned p)231         void setPos(unsigned p)
232         {
233             pos = p;
234         }
235 
atStart()236         bool atStart()
237         {
238             return pos == 0;
239         }
240 
atEnd()241         bool atEnd()
242         {
243             return pos == length;
244         }
245 
checkInput(int count)246         bool checkInput(int count)
247         {
248             if ((pos + count) <= length) {
249                 pos += count;
250                 return true;
251             }
252             return false;
253         }
254 
uncheckInput(int count)255         void uncheckInput(int count)
256         {
257             pos -= count;
258         }
259 
atStart(int position)260         bool atStart(int position)
261         {
262             return (pos + position) == 0;
263         }
264 
atEnd(int position)265         bool atEnd(int position)
266         {
267             return (pos + position) == length;
268         }
269 
isNotAvailableInput(int position)270         bool isNotAvailableInput(int position)
271         {
272             return (pos + position) > length;
273         }
274 
275     private:
276         const UChar* input;
277         unsigned pos;
278         unsigned length;
279     };
280 
testCharacterClass(CharacterClass * characterClass,int ch)281     bool testCharacterClass(CharacterClass* characterClass, int ch)
282     {
283         if (ch & 0xFF80) {
284             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
285                 if (ch == characterClass->m_matchesUnicode[i])
286                     return true;
287             for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
288                 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
289                     return true;
290         } else {
291             for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
292                 if (ch == characterClass->m_matches[i])
293                     return true;
294             for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
295                 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
296                     return true;
297         }
298 
299         return false;
300     }
301 
checkCharacter(int testChar,int inputPosition)302     bool checkCharacter(int testChar, int inputPosition)
303     {
304         return testChar == input.readChecked(inputPosition);
305     }
306 
checkCasedCharacter(int loChar,int hiChar,int inputPosition)307     bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
308     {
309         int ch = input.readChecked(inputPosition);
310         return (loChar == ch) || (hiChar == ch);
311     }
312 
checkCharacterClass(CharacterClass * characterClass,bool invert,int inputPosition)313     bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
314     {
315         bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
316         return invert ? !match : match;
317     }
318 
tryConsumeBackReference(int matchBegin,int matchEnd,int inputOffset)319     bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
320     {
321         int matchSize = matchEnd - matchBegin;
322 
323         if (!input.checkInput(matchSize))
324             return false;
325 
326         if (pattern->m_ignoreCase) {
327             for (int i = 0; i < matchSize; ++i) {
328                 int ch = input.reread(matchBegin + i);
329 
330                 int lo = Unicode::toLower(ch);
331                 int hi = Unicode::toUpper(ch);
332 
333                 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
334                     input.uncheckInput(matchSize);
335                     return false;
336                 }
337             }
338         } else {
339             for (int i = 0; i < matchSize; ++i) {
340                 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
341                     input.uncheckInput(matchSize);
342                     return false;
343                 }
344             }
345         }
346 
347         return true;
348     }
349 
matchAssertionBOL(ByteTerm & term)350     bool matchAssertionBOL(ByteTerm& term)
351     {
352         return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
353     }
354 
matchAssertionEOL(ByteTerm & term)355     bool matchAssertionEOL(ByteTerm& term)
356     {
357         if (term.inputPosition)
358             return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
359 
360         return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
361     }
362 
matchAssertionWordBoundary(ByteTerm & term)363     bool matchAssertionWordBoundary(ByteTerm& term)
364     {
365         bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
366         bool readIsWordchar;
367         if (term.inputPosition)
368             readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
369         else
370             readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
371 
372         bool wordBoundary = prevIsWordchar != readIsWordchar;
373         return term.invert() ? !wordBoundary : wordBoundary;
374     }
375 
backtrackPatternCharacter(ByteTerm & term,DisjunctionContext * context)376     bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
377     {
378         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
379 
380         switch (term.atom.quantityType) {
381         case QuantifierFixedCount:
382             break;
383 
384         case QuantifierGreedy:
385             if (backTrack->matchAmount) {
386                 --backTrack->matchAmount;
387                 input.uncheckInput(1);
388                 return true;
389             }
390             break;
391 
392         case QuantifierNonGreedy:
393             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
394                 ++backTrack->matchAmount;
395                 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
396                     return true;
397             }
398             input.uncheckInput(backTrack->matchAmount);
399             break;
400         }
401 
402         return false;
403     }
404 
backtrackPatternCasedCharacter(ByteTerm & term,DisjunctionContext * context)405     bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
406     {
407         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
408 
409         switch (term.atom.quantityType) {
410         case QuantifierFixedCount:
411             break;
412 
413         case QuantifierGreedy:
414             if (backTrack->matchAmount) {
415                 --backTrack->matchAmount;
416                 input.uncheckInput(1);
417                 return true;
418             }
419             break;
420 
421         case QuantifierNonGreedy:
422             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
423                 ++backTrack->matchAmount;
424                 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
425                     return true;
426             }
427             input.uncheckInput(backTrack->matchAmount);
428             break;
429         }
430 
431         return false;
432     }
433 
matchCharacterClass(ByteTerm & term,DisjunctionContext * context)434     bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
435     {
436         ASSERT(term.type == ByteTerm::TypeCharacterClass);
437         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
438 
439         switch (term.atom.quantityType) {
440         case QuantifierFixedCount: {
441             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
442                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
443                     return false;
444             }
445             return true;
446         }
447 
448         case QuantifierGreedy: {
449             unsigned matchAmount = 0;
450             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
451                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
452                     input.uncheckInput(1);
453                     break;
454                 }
455                 ++matchAmount;
456             }
457             backTrack->matchAmount = matchAmount;
458 
459             return true;
460         }
461 
462         case QuantifierNonGreedy:
463             backTrack->matchAmount = 0;
464             return true;
465         }
466 
467         ASSERT_NOT_REACHED();
468         return false;
469     }
470 
backtrackCharacterClass(ByteTerm & term,DisjunctionContext * context)471     bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
472     {
473         ASSERT(term.type == ByteTerm::TypeCharacterClass);
474         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
475 
476         switch (term.atom.quantityType) {
477         case QuantifierFixedCount:
478             break;
479 
480         case QuantifierGreedy:
481             if (backTrack->matchAmount) {
482                 --backTrack->matchAmount;
483                 input.uncheckInput(1);
484                 return true;
485             }
486             break;
487 
488         case QuantifierNonGreedy:
489             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
490                 ++backTrack->matchAmount;
491                 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
492                     return true;
493             }
494             input.uncheckInput(backTrack->matchAmount);
495             break;
496         }
497 
498         return false;
499     }
500 
matchBackReference(ByteTerm & term,DisjunctionContext * context)501     bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
502     {
503         ASSERT(term.type == ByteTerm::TypeBackReference);
504         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
505 
506         int matchBegin = output[(term.atom.subpatternId << 1)];
507         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
508 
509         // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
510         // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
511         // Eg.: /(a\1)/
512         if (matchEnd == -1)
513             return true;
514 
515         ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
516 
517         if (matchBegin == matchEnd)
518             return true;
519 
520         switch (term.atom.quantityType) {
521         case QuantifierFixedCount: {
522             backTrack->begin = input.getPos();
523             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
524                 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
525                     input.setPos(backTrack->begin);
526                     return false;
527                 }
528             }
529             return true;
530         }
531 
532         case QuantifierGreedy: {
533             unsigned matchAmount = 0;
534             while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
535                 ++matchAmount;
536             backTrack->matchAmount = matchAmount;
537             return true;
538         }
539 
540         case QuantifierNonGreedy:
541             backTrack->begin = input.getPos();
542             backTrack->matchAmount = 0;
543             return true;
544         }
545 
546         ASSERT_NOT_REACHED();
547         return false;
548     }
549 
backtrackBackReference(ByteTerm & term,DisjunctionContext * context)550     bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
551     {
552         ASSERT(term.type == ByteTerm::TypeBackReference);
553         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
554 
555         int matchBegin = output[(term.atom.subpatternId << 1)];
556         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
557         ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
558 
559         if (matchBegin == matchEnd)
560             return false;
561 
562         switch (term.atom.quantityType) {
563         case QuantifierFixedCount:
564             // for quantityCount == 1, could rewind.
565             input.setPos(backTrack->begin);
566             break;
567 
568         case QuantifierGreedy:
569             if (backTrack->matchAmount) {
570                 --backTrack->matchAmount;
571                 input.rewind(matchEnd - matchBegin);
572                 return true;
573             }
574             break;
575 
576         case QuantifierNonGreedy:
577             if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
578                 ++backTrack->matchAmount;
579                 return true;
580             }
581             input.setPos(backTrack->begin);
582             break;
583         }
584 
585         return false;
586     }
587 
recordParenthesesMatch(ByteTerm & term,ParenthesesDisjunctionContext * context)588     void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
589     {
590         if (term.capture()) {
591             unsigned subpatternId = term.atom.subpatternId;
592             output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
593             output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
594         }
595     }
resetMatches(ByteTerm & term,ParenthesesDisjunctionContext * context)596     void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
597     {
598         unsigned firstSubpatternId = term.atom.subpatternId;
599         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
600         context->restoreOutput(output, firstSubpatternId, count);
601     }
parenthesesDoBacktrack(ByteTerm & term,BackTrackInfoParentheses * backTrack)602     JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603     {
604         while (backTrack->matchAmount) {
605             ParenthesesDisjunctionContext* context = backTrack->lastContext;
606 
607             JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
608             if (result == JSRegExpMatch)
609                 return JSRegExpMatch;
610 
611             resetMatches(term, context);
612             popParenthesesDisjunctionContext(backTrack);
613             freeParenthesesDisjunctionContext(context);
614 
615             if (result != JSRegExpNoMatch)
616                 return result;
617         }
618 
619         return JSRegExpNoMatch;
620     }
621 
matchParenthesesOnceBegin(ByteTerm & term,DisjunctionContext * context)622     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
623     {
624         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
625         ASSERT(term.atom.quantityCount == 1);
626 
627         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
628 
629         switch (term.atom.quantityType) {
630         case QuantifierGreedy: {
631             // set this speculatively; if we get to the parens end this will be true.
632             backTrack->begin = input.getPos();
633             break;
634         }
635         case QuantifierNonGreedy: {
636             backTrack->begin = notFound;
637             context->term += term.atom.parenthesesWidth;
638             return true;
639         }
640         case QuantifierFixedCount:
641             break;
642         }
643 
644         if (term.capture()) {
645             unsigned subpatternId = term.atom.subpatternId;
646             output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
647         }
648 
649         return true;
650     }
651 
matchParenthesesOnceEnd(ByteTerm & term,DisjunctionContext * context)652     bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
653     {
654         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
655         ASSERT(term.atom.quantityCount == 1);
656 
657         if (term.capture()) {
658             unsigned subpatternId = term.atom.subpatternId;
659             output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
660         }
661 
662         if (term.atom.quantityType == QuantifierFixedCount)
663             return true;
664 
665         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666         return backTrack->begin != input.getPos();
667     }
668 
backtrackParenthesesOnceBegin(ByteTerm & term,DisjunctionContext * context)669     bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
670     {
671         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
672         ASSERT(term.atom.quantityCount == 1);
673 
674         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
675 
676         if (term.capture()) {
677             unsigned subpatternId = term.atom.subpatternId;
678             output[(subpatternId << 1)] = -1;
679             output[(subpatternId << 1) + 1] = -1;
680         }
681 
682         switch (term.atom.quantityType) {
683         case QuantifierGreedy:
684             // if we backtrack to this point, there is another chance - try matching nothing.
685             ASSERT(backTrack->begin != notFound);
686             backTrack->begin = notFound;
687             context->term += term.atom.parenthesesWidth;
688             return true;
689         case QuantifierNonGreedy:
690             ASSERT(backTrack->begin != notFound);
691         case QuantifierFixedCount:
692             break;
693         }
694 
695         return false;
696     }
697 
backtrackParenthesesOnceEnd(ByteTerm & term,DisjunctionContext * context)698     bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
699     {
700         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
701         ASSERT(term.atom.quantityCount == 1);
702 
703         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
704 
705         switch (term.atom.quantityType) {
706         case QuantifierGreedy:
707             if (backTrack->begin == notFound) {
708                 context->term -= term.atom.parenthesesWidth;
709                 return false;
710             }
711         case QuantifierNonGreedy:
712             if (backTrack->begin == notFound) {
713                 backTrack->begin = input.getPos();
714                 if (term.capture()) {
715                     // Technically this access to inputPosition should be accessing the begin term's
716                     // inputPosition, but for repeats other than fixed these values should be
717                     // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
718                     ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
719                     ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
720                     unsigned subpatternId = term.atom.subpatternId;
721                     output[subpatternId << 1] = input.getPos() + term.inputPosition;
722                 }
723                 context->term -= term.atom.parenthesesWidth;
724                 return true;
725             }
726         case QuantifierFixedCount:
727             break;
728         }
729 
730         return false;
731     }
732 
matchParenthesesTerminalBegin(ByteTerm & term,DisjunctionContext * context)733     bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
734     {
735         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
736         ASSERT(term.atom.quantityType == QuantifierGreedy);
737         ASSERT(term.atom.quantityCount == quantifyInfinite);
738         ASSERT(!term.capture());
739 
740         BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
741         backTrack->begin = input.getPos();
742         return true;
743     }
744 
matchParenthesesTerminalEnd(ByteTerm & term,DisjunctionContext * context)745     bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
746     {
747         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
748 
749         BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
750         // Empty match is a failed match.
751         if (backTrack->begin == input.getPos())
752             return false;
753 
754         // Successful match! Okay, what's next? - loop around and try to match moar!
755         context->term -= (term.atom.parenthesesWidth + 1);
756         return true;
757     }
758 
backtrackParenthesesTerminalBegin(ByteTerm & term,DisjunctionContext * context)759     bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
760     {
761         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
762         ASSERT(term.atom.quantityType == QuantifierGreedy);
763         ASSERT(term.atom.quantityCount == quantifyInfinite);
764         ASSERT(!term.capture());
765 
766         // If we backtrack to this point, we have failed to match this iteration of the parens.
767         // Since this is greedy / zero minimum a failed is also accepted as a match!
768         context->term += term.atom.parenthesesWidth;
769         return true;
770     }
771 
backtrackParenthesesTerminalEnd(ByteTerm &,DisjunctionContext *)772     bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
773     {
774         // 'Terminal' parentheses are at the end of the regex, and as such a match past end
775         // should always be returned as a successful match - we should never backtrack to here.
776         ASSERT_NOT_REACHED();
777         return false;
778     }
779 
matchParentheticalAssertionBegin(ByteTerm & term,DisjunctionContext * context)780     bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
781     {
782         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
783         ASSERT(term.atom.quantityCount == 1);
784 
785         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
786 
787         backTrack->begin = input.getPos();
788         return true;
789     }
790 
matchParentheticalAssertionEnd(ByteTerm & term,DisjunctionContext * context)791     bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
792     {
793         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
794         ASSERT(term.atom.quantityCount == 1);
795 
796         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
797 
798         input.setPos(backTrack->begin);
799 
800         // We've reached the end of the parens; if they are inverted, this is failure.
801         if (term.invert()) {
802             context->term -= term.atom.parenthesesWidth;
803             return false;
804         }
805 
806         return true;
807     }
808 
backtrackParentheticalAssertionBegin(ByteTerm & term,DisjunctionContext * context)809     bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
810     {
811         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
812         ASSERT(term.atom.quantityCount == 1);
813 
814         // We've failed to match parens; if they are inverted, this is win!
815         if (term.invert()) {
816             context->term += term.atom.parenthesesWidth;
817             return true;
818         }
819 
820         return false;
821     }
822 
backtrackParentheticalAssertionEnd(ByteTerm & term,DisjunctionContext * context)823     bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
824     {
825         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
826         ASSERT(term.atom.quantityCount == 1);
827 
828         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
829 
830         input.setPos(backTrack->begin);
831 
832         context->term -= term.atom.parenthesesWidth;
833         return false;
834     }
835 
matchParentheses(ByteTerm & term,DisjunctionContext * context)836     JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
837     {
838         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
839 
840         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
841         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
842 
843         backTrack->matchAmount = 0;
844         backTrack->lastContext = 0;
845 
846         switch (term.atom.quantityType) {
847         case QuantifierFixedCount: {
848             // While we haven't yet reached our fixed limit,
849             while (backTrack->matchAmount < term.atom.quantityCount) {
850                 // Try to do a match, and it it succeeds, add it to the list.
851                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
852                 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
853                 if (result == JSRegExpMatch)
854                     appendParenthesesDisjunctionContext(backTrack, context);
855                 else {
856                     // The match failed; try to find an alternate point to carry on from.
857                     resetMatches(term, context);
858                     freeParenthesesDisjunctionContext(context);
859 
860                     if (result == JSRegExpNoMatch) {
861                         JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
862                         if (backtrackResult != JSRegExpMatch)
863                             return backtrackResult;
864                     } else
865                         return result;
866                 }
867             }
868 
869             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
870             ParenthesesDisjunctionContext* context = backTrack->lastContext;
871             recordParenthesesMatch(term, context);
872             return JSRegExpMatch;
873         }
874 
875         case QuantifierGreedy: {
876             while (backTrack->matchAmount < term.atom.quantityCount) {
877                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
878                 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
879                 if (result == JSRegExpMatch)
880                     appendParenthesesDisjunctionContext(backTrack, context);
881                 else {
882                     resetMatches(term, context);
883                     freeParenthesesDisjunctionContext(context);
884 
885                     if (result != JSRegExpNoMatch)
886                         return result;
887 
888                     break;
889                 }
890             }
891 
892             if (backTrack->matchAmount) {
893                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
894                 recordParenthesesMatch(term, context);
895             }
896             return JSRegExpMatch;
897         }
898 
899         case QuantifierNonGreedy:
900             return JSRegExpMatch;
901         }
902 
903         ASSERT_NOT_REACHED();
904         return JSRegExpErrorNoMatch;
905     }
906 
907     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
908     //
909     // Greedy matches never should try just adding more - you should already have done
910     // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
911     // you backtrack an item off the list needs checking, since we'll never have matched
912     // the one less case.  Tracking forwards, still add as much as possible.
913     //
914     // Non-greedy, we've already done the one less case, so don't match on popping.
915     // We haven't done the one more case, so always try to add that.
916     //
backtrackParentheses(ByteTerm & term,DisjunctionContext * context)917     JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
918     {
919         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
920 
921         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
922         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
923 
924         switch (term.atom.quantityType) {
925         case QuantifierFixedCount: {
926             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
927 
928             ParenthesesDisjunctionContext* context = 0;
929             JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
930 
931             if (result != JSRegExpMatch)
932                 return result;
933 
934             // While we haven't yet reached our fixed limit,
935             while (backTrack->matchAmount < term.atom.quantityCount) {
936                 // Try to do a match, and it it succeeds, add it to the list.
937                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
938                 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
939 
940                 if (result == JSRegExpMatch)
941                     appendParenthesesDisjunctionContext(backTrack, context);
942                 else {
943                     // The match failed; try to find an alternate point to carry on from.
944                     resetMatches(term, context);
945                     freeParenthesesDisjunctionContext(context);
946 
947                     if (result == JSRegExpNoMatch) {
948                         JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
949                         if (backtrackResult != JSRegExpMatch)
950                             return backtrackResult;
951                     } else
952                         return result;
953                 }
954             }
955 
956             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
957             context = backTrack->lastContext;
958             recordParenthesesMatch(term, context);
959             return JSRegExpMatch;
960         }
961 
962         case QuantifierGreedy: {
963             if (!backTrack->matchAmount)
964                 return JSRegExpNoMatch;
965 
966             ParenthesesDisjunctionContext* context = backTrack->lastContext;
967             JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
968             if (result == JSRegExpMatch) {
969                 while (backTrack->matchAmount < term.atom.quantityCount) {
970                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
971                     JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
972                     if (parenthesesResult == JSRegExpMatch)
973                         appendParenthesesDisjunctionContext(backTrack, context);
974                     else {
975                         resetMatches(term, context);
976                         freeParenthesesDisjunctionContext(context);
977 
978                         if (parenthesesResult != JSRegExpNoMatch)
979                             return parenthesesResult;
980 
981                         break;
982                     }
983                 }
984             } else {
985                 resetMatches(term, context);
986                 popParenthesesDisjunctionContext(backTrack);
987                 freeParenthesesDisjunctionContext(context);
988 
989                 if (result != JSRegExpNoMatch)
990                     return result;
991             }
992 
993             if (backTrack->matchAmount) {
994                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
995                 recordParenthesesMatch(term, context);
996             }
997             return JSRegExpMatch;
998         }
999 
1000         case QuantifierNonGreedy: {
1001             // If we've not reached the limit, try to add one more match.
1002             if (backTrack->matchAmount < term.atom.quantityCount) {
1003                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1004                 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1005                 if (result == JSRegExpMatch) {
1006                     appendParenthesesDisjunctionContext(backTrack, context);
1007                     recordParenthesesMatch(term, context);
1008                     return JSRegExpMatch;
1009                 }
1010 
1011                 resetMatches(term, context);
1012                 freeParenthesesDisjunctionContext(context);
1013 
1014                 if (result != JSRegExpNoMatch)
1015                     return result;
1016             }
1017 
1018             // Nope - okay backtrack looking for an alternative.
1019             while (backTrack->matchAmount) {
1020                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1021                 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1022                 if (result == JSRegExpMatch) {
1023                     // successful backtrack! we're back in the game!
1024                     if (backTrack->matchAmount) {
1025                         context = backTrack->lastContext;
1026                         recordParenthesesMatch(term, context);
1027                     }
1028                     return JSRegExpMatch;
1029                 }
1030 
1031                 // pop a match off the stack
1032                 resetMatches(term, context);
1033                 popParenthesesDisjunctionContext(backTrack);
1034                 freeParenthesesDisjunctionContext(context);
1035 
1036                 return result;
1037             }
1038 
1039             return JSRegExpNoMatch;
1040         }
1041         }
1042 
1043         ASSERT_NOT_REACHED();
1044         return JSRegExpErrorNoMatch;
1045     }
1046 
lookupForBeginChars()1047     void lookupForBeginChars()
1048     {
1049         int character;
1050         bool firstSingleCharFound;
1051 
1052         while (true) {
1053             if (input.isNotAvailableInput(2))
1054                 return;
1055 
1056             firstSingleCharFound = false;
1057 
1058             character = input.readPair();
1059 
1060             for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
1061                 BeginChar bc = pattern->m_beginChars[i];
1062 
1063                 if (!firstSingleCharFound && bc.value <= 0xFFFF) {
1064                     firstSingleCharFound = true;
1065                     character &= 0xFFFF;
1066                 }
1067 
1068                 if ((character | bc.mask) == bc.value)
1069                     return;
1070             }
1071 
1072             input.next();
1073         }
1074     }
1075 
1076 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1077 #define BACKTRACK() { --context->term; goto backtrack; }
1078 #define currentTerm() (disjunction->terms[context->term])
matchDisjunction(ByteDisjunction * disjunction,DisjunctionContext * context,bool btrack=false,bool isBody=false)1079     JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
1080     {
1081         if (!--remainingMatchCount)
1082             return JSRegExpErrorHitLimit;
1083 
1084         if (btrack)
1085             BACKTRACK();
1086 
1087         if (pattern->m_containsBeginChars && isBody)
1088             lookupForBeginChars();
1089 
1090         context->matchBegin = input.getPos();
1091         context->term = 0;
1092 
1093     matchAgain:
1094         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1095 
1096         switch (currentTerm().type) {
1097         case ByteTerm::TypeSubpatternBegin:
1098             MATCH_NEXT();
1099         case ByteTerm::TypeSubpatternEnd:
1100             context->matchEnd = input.getPos();
1101             return JSRegExpMatch;
1102 
1103         case ByteTerm::TypeBodyAlternativeBegin:
1104             MATCH_NEXT();
1105         case ByteTerm::TypeBodyAlternativeDisjunction:
1106         case ByteTerm::TypeBodyAlternativeEnd:
1107             context->matchEnd = input.getPos();
1108             return JSRegExpMatch;
1109 
1110         case ByteTerm::TypeAlternativeBegin:
1111             MATCH_NEXT();
1112         case ByteTerm::TypeAlternativeDisjunction:
1113         case ByteTerm::TypeAlternativeEnd: {
1114             int offset = currentTerm().alternative.end;
1115             BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1116             backTrack->offset = offset;
1117             context->term += offset;
1118             MATCH_NEXT();
1119         }
1120 
1121         case ByteTerm::TypeAssertionBOL:
1122             if (matchAssertionBOL(currentTerm()))
1123                 MATCH_NEXT();
1124             BACKTRACK();
1125         case ByteTerm::TypeAssertionEOL:
1126             if (matchAssertionEOL(currentTerm()))
1127                 MATCH_NEXT();
1128             BACKTRACK();
1129         case ByteTerm::TypeAssertionWordBoundary:
1130             if (matchAssertionWordBoundary(currentTerm()))
1131                 MATCH_NEXT();
1132             BACKTRACK();
1133 
1134         case ByteTerm::TypePatternCharacterOnce:
1135         case ByteTerm::TypePatternCharacterFixed: {
1136             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1137                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1138                     BACKTRACK();
1139             }
1140             MATCH_NEXT();
1141         }
1142         case ByteTerm::TypePatternCharacterGreedy: {
1143             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1144             unsigned matchAmount = 0;
1145             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1146                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1147                     input.uncheckInput(1);
1148                     break;
1149                 }
1150                 ++matchAmount;
1151             }
1152             backTrack->matchAmount = matchAmount;
1153 
1154             MATCH_NEXT();
1155         }
1156         case ByteTerm::TypePatternCharacterNonGreedy: {
1157             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1158             backTrack->matchAmount = 0;
1159             MATCH_NEXT();
1160         }
1161 
1162         case ByteTerm::TypePatternCasedCharacterOnce:
1163         case ByteTerm::TypePatternCasedCharacterFixed: {
1164             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1165                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1166                     BACKTRACK();
1167             }
1168             MATCH_NEXT();
1169         }
1170         case ByteTerm::TypePatternCasedCharacterGreedy: {
1171             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1172             unsigned matchAmount = 0;
1173             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1174                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1175                     input.uncheckInput(1);
1176                     break;
1177                 }
1178                 ++matchAmount;
1179             }
1180             backTrack->matchAmount = matchAmount;
1181 
1182             MATCH_NEXT();
1183         }
1184         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1185             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1186             backTrack->matchAmount = 0;
1187             MATCH_NEXT();
1188         }
1189 
1190         case ByteTerm::TypeCharacterClass:
1191             if (matchCharacterClass(currentTerm(), context))
1192                 MATCH_NEXT();
1193             BACKTRACK();
1194         case ByteTerm::TypeBackReference:
1195             if (matchBackReference(currentTerm(), context))
1196                 MATCH_NEXT();
1197             BACKTRACK();
1198         case ByteTerm::TypeParenthesesSubpattern: {
1199             JSRegExpResult result = matchParentheses(currentTerm(), context);
1200 
1201             if (result == JSRegExpMatch) {
1202                 MATCH_NEXT();
1203             }  else if (result != JSRegExpNoMatch)
1204                 return result;
1205 
1206             BACKTRACK();
1207         }
1208         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1209             if (matchParenthesesOnceBegin(currentTerm(), context))
1210                 MATCH_NEXT();
1211             BACKTRACK();
1212         case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1213             if (matchParenthesesOnceEnd(currentTerm(), context))
1214                 MATCH_NEXT();
1215             BACKTRACK();
1216         case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1217             if (matchParenthesesTerminalBegin(currentTerm(), context))
1218                 MATCH_NEXT();
1219             BACKTRACK();
1220         case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1221             if (matchParenthesesTerminalEnd(currentTerm(), context))
1222                 MATCH_NEXT();
1223             BACKTRACK();
1224         case ByteTerm::TypeParentheticalAssertionBegin:
1225             if (matchParentheticalAssertionBegin(currentTerm(), context))
1226                 MATCH_NEXT();
1227             BACKTRACK();
1228         case ByteTerm::TypeParentheticalAssertionEnd:
1229             if (matchParentheticalAssertionEnd(currentTerm(), context))
1230                 MATCH_NEXT();
1231             BACKTRACK();
1232 
1233         case ByteTerm::TypeCheckInput:
1234             if (input.checkInput(currentTerm().checkInputCount))
1235                 MATCH_NEXT();
1236             BACKTRACK();
1237 
1238             case ByteTerm::TypeUncheckInput:
1239                 input.uncheckInput(currentTerm().checkInputCount);
1240                 MATCH_NEXT();
1241         }
1242 
1243         // We should never fall-through to here.
1244         ASSERT_NOT_REACHED();
1245 
1246     backtrack:
1247         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1248 
1249         switch (currentTerm().type) {
1250         case ByteTerm::TypeSubpatternBegin:
1251             return JSRegExpNoMatch;
1252         case ByteTerm::TypeSubpatternEnd:
1253             ASSERT_NOT_REACHED();
1254 
1255         case ByteTerm::TypeBodyAlternativeBegin:
1256         case ByteTerm::TypeBodyAlternativeDisjunction: {
1257             int offset = currentTerm().alternative.next;
1258             context->term += offset;
1259             if (offset > 0)
1260                 MATCH_NEXT();
1261 
1262             if (input.atEnd())
1263                 return JSRegExpNoMatch;
1264 
1265             input.next();
1266 
1267             if (pattern->m_containsBeginChars && isBody)
1268                 lookupForBeginChars();
1269 
1270             context->matchBegin = input.getPos();
1271 
1272             if (currentTerm().alternative.onceThrough)
1273                 context->term += currentTerm().alternative.next;
1274 
1275             MATCH_NEXT();
1276         }
1277         case ByteTerm::TypeBodyAlternativeEnd:
1278             ASSERT_NOT_REACHED();
1279 
1280             case ByteTerm::TypeAlternativeBegin:
1281             case ByteTerm::TypeAlternativeDisjunction: {
1282                 int offset = currentTerm().alternative.next;
1283                 context->term += offset;
1284                 if (offset > 0)
1285                     MATCH_NEXT();
1286                 BACKTRACK();
1287             }
1288             case ByteTerm::TypeAlternativeEnd: {
1289                 // We should never backtrack back into an alternative of the main body of the regex.
1290                 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1291                 unsigned offset = backTrack->offset;
1292                 context->term -= offset;
1293                 BACKTRACK();
1294             }
1295 
1296             case ByteTerm::TypeAssertionBOL:
1297             case ByteTerm::TypeAssertionEOL:
1298             case ByteTerm::TypeAssertionWordBoundary:
1299                 BACKTRACK();
1300 
1301             case ByteTerm::TypePatternCharacterOnce:
1302             case ByteTerm::TypePatternCharacterFixed:
1303             case ByteTerm::TypePatternCharacterGreedy:
1304             case ByteTerm::TypePatternCharacterNonGreedy:
1305                 if (backtrackPatternCharacter(currentTerm(), context))
1306                     MATCH_NEXT();
1307                 BACKTRACK();
1308             case ByteTerm::TypePatternCasedCharacterOnce:
1309             case ByteTerm::TypePatternCasedCharacterFixed:
1310             case ByteTerm::TypePatternCasedCharacterGreedy:
1311             case ByteTerm::TypePatternCasedCharacterNonGreedy:
1312                 if (backtrackPatternCasedCharacter(currentTerm(), context))
1313                     MATCH_NEXT();
1314                 BACKTRACK();
1315             case ByteTerm::TypeCharacterClass:
1316                 if (backtrackCharacterClass(currentTerm(), context))
1317                     MATCH_NEXT();
1318                 BACKTRACK();
1319             case ByteTerm::TypeBackReference:
1320                 if (backtrackBackReference(currentTerm(), context))
1321                     MATCH_NEXT();
1322                 BACKTRACK();
1323             case ByteTerm::TypeParenthesesSubpattern: {
1324                 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1325 
1326                 if (result == JSRegExpMatch) {
1327                     MATCH_NEXT();
1328                 } else if (result != JSRegExpNoMatch)
1329                     return result;
1330 
1331                 BACKTRACK();
1332             }
1333             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1334                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1335                     MATCH_NEXT();
1336                 BACKTRACK();
1337             case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1338                 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1339                     MATCH_NEXT();
1340                 BACKTRACK();
1341             case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1342                 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1343                     MATCH_NEXT();
1344                 BACKTRACK();
1345             case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1346                 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1347                     MATCH_NEXT();
1348                 BACKTRACK();
1349             case ByteTerm::TypeParentheticalAssertionBegin:
1350                 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1351                     MATCH_NEXT();
1352                 BACKTRACK();
1353             case ByteTerm::TypeParentheticalAssertionEnd:
1354                 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1355                     MATCH_NEXT();
1356                 BACKTRACK();
1357 
1358             case ByteTerm::TypeCheckInput:
1359                 input.uncheckInput(currentTerm().checkInputCount);
1360                 BACKTRACK();
1361 
1362             case ByteTerm::TypeUncheckInput:
1363                 input.checkInput(currentTerm().checkInputCount);
1364                 BACKTRACK();
1365         }
1366 
1367         ASSERT_NOT_REACHED();
1368         return JSRegExpErrorNoMatch;
1369     }
1370 
matchNonZeroDisjunction(ByteDisjunction * disjunction,DisjunctionContext * context,bool btrack=false)1371     JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1372     {
1373         JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1374 
1375         if (result == JSRegExpMatch) {
1376             while (context->matchBegin == context->matchEnd) {
1377                 result = matchDisjunction(disjunction, context, true);
1378                 if (result != JSRegExpMatch)
1379                     return result;
1380             }
1381             return JSRegExpMatch;
1382         }
1383 
1384         return result;
1385     }
1386 
interpret()1387     int interpret()
1388     {
1389         allocatorPool = pattern->m_allocator->startAllocator();
1390         if (!allocatorPool)
1391             CRASH();
1392 
1393         for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1394             output[i] = -1;
1395 
1396         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1397 
1398         JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
1399         if (result == JSRegExpMatch) {
1400             output[0] = context->matchBegin;
1401             output[1] = context->matchEnd;
1402         }
1403 
1404         freeDisjunctionContext(context);
1405 
1406         pattern->m_allocator->stopAllocator();
1407 
1408         // RegExp.cpp currently expects all error to be converted to -1.
1409         ASSERT((result == JSRegExpMatch) == (output[0] != -1));
1410         return output[0];
1411     }
1412 
Interpreter(BytecodePattern * pattern,int * output,const UChar * inputChar,unsigned start,unsigned length)1413     Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1414         : pattern(pattern)
1415         , output(output)
1416         , input(inputChar, start, length)
1417         , allocatorPool(0)
1418         , remainingMatchCount(matchLimit)
1419     {
1420     }
1421 
1422 private:
1423     BytecodePattern* pattern;
1424     int* output;
1425     InputStream input;
1426     BumpPointerPool* allocatorPool;
1427     unsigned remainingMatchCount;
1428 };
1429 
1430 
1431 
1432 class ByteCompiler {
1433     struct ParenthesesStackEntry {
1434         unsigned beginTerm;
1435         unsigned savedAlternativeIndex;
ParenthesesStackEntryJSC::Yarr::ByteCompiler::ParenthesesStackEntry1436         ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1437             : beginTerm(beginTerm)
1438             , savedAlternativeIndex(savedAlternativeIndex)
1439         {
1440         }
1441     };
1442 
1443 public:
ByteCompiler(YarrPattern & pattern)1444     ByteCompiler(YarrPattern& pattern)
1445         : m_pattern(pattern)
1446     {
1447         m_currentAlternativeIndex = 0;
1448     }
1449 
compile(BumpPointerAllocator * allocator)1450     PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1451     {
1452         regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1453         emitDisjunction(m_pattern.m_body);
1454         regexEnd();
1455 
1456         return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1457     }
1458 
checkInput(unsigned count)1459     void checkInput(unsigned count)
1460     {
1461         m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1462     }
1463 
uncheckInput(unsigned count)1464     void uncheckInput(unsigned count)
1465     {
1466         m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1467     }
1468 
assertionBOL(int inputPosition)1469     void assertionBOL(int inputPosition)
1470     {
1471         m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1472     }
1473 
assertionEOL(int inputPosition)1474     void assertionEOL(int inputPosition)
1475     {
1476         m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1477     }
1478 
assertionWordBoundary(bool invert,int inputPosition)1479     void assertionWordBoundary(bool invert, int inputPosition)
1480     {
1481         m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1482     }
1483 
atomPatternCharacter(UChar ch,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1484     void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1485     {
1486         if (m_pattern.m_ignoreCase) {
1487             UChar lo = Unicode::toLower(ch);
1488             UChar hi = Unicode::toUpper(ch);
1489 
1490             if (lo != hi) {
1491                 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1492                 return;
1493             }
1494         }
1495 
1496         m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1497     }
1498 
atomCharacterClass(CharacterClass * characterClass,bool invert,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1499     void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1500     {
1501         m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1502 
1503         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1504         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1505         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1506     }
1507 
atomBackReference(unsigned subpatternId,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1508     void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1509     {
1510         ASSERT(subpatternId);
1511 
1512         m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1513 
1514         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1515         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1516         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1517     }
1518 
atomParenthesesOnceBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1519     void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1520     {
1521         int beginTerm = m_bodyDisjunction->terms.size();
1522 
1523         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1524         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1525         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1526         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1527 
1528         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1529         m_currentAlternativeIndex = beginTerm + 1;
1530     }
1531 
atomParenthesesTerminalBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1532     void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1533     {
1534         int beginTerm = m_bodyDisjunction->terms.size();
1535 
1536         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1537         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1538         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1539         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1540 
1541         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1542         m_currentAlternativeIndex = beginTerm + 1;
1543     }
1544 
atomParenthesesSubpatternBegin(unsigned subpatternId,bool capture,int inputPosition,unsigned frameLocation,unsigned alternativeFrameLocation)1545     void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1546     {
1547         // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1548         // then fix this up at the end! - simplifying this should make it much clearer.
1549         // https://bugs.webkit.org/show_bug.cgi?id=50136
1550 
1551         int beginTerm = m_bodyDisjunction->terms.size();
1552 
1553         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1554         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1555         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1556         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1557 
1558         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1559         m_currentAlternativeIndex = beginTerm + 1;
1560     }
1561 
atomParentheticalAssertionBegin(unsigned subpatternId,bool invert,unsigned frameLocation,unsigned alternativeFrameLocation)1562     void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1563     {
1564         int beginTerm = m_bodyDisjunction->terms.size();
1565 
1566         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1567         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1568         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1569         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1570 
1571         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1572         m_currentAlternativeIndex = beginTerm + 1;
1573     }
1574 
atomParentheticalAssertionEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1575     void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1576     {
1577         unsigned beginTerm = popParenthesesStack();
1578         closeAlternative(beginTerm + 1);
1579         unsigned endTerm = m_bodyDisjunction->terms.size();
1580 
1581         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1582 
1583         bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1584         unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1585 
1586         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1587         m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1588         m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1589         m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1590 
1591         m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1592         m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1593         m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1594         m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1595     }
1596 
popParenthesesStack()1597     unsigned popParenthesesStack()
1598     {
1599         ASSERT(m_parenthesesStack.size());
1600         int stackEnd = m_parenthesesStack.size() - 1;
1601         unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1602         m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1603         m_parenthesesStack.shrink(stackEnd);
1604 
1605         ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1606         ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1607 
1608         return beginTerm;
1609     }
1610 
1611 #ifndef NDEBUG
dumpDisjunction(ByteDisjunction * disjunction)1612     void dumpDisjunction(ByteDisjunction* disjunction)
1613     {
1614         printf("ByteDisjunction(%p):\n\t", disjunction);
1615         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1616             printf("{ %d } ", disjunction->terms[i].type);
1617         printf("\n");
1618     }
1619 #endif
1620 
closeAlternative(int beginTerm)1621     void closeAlternative(int beginTerm)
1622     {
1623         int origBeginTerm = beginTerm;
1624         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1625         int endIndex = m_bodyDisjunction->terms.size();
1626 
1627         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1628 
1629         if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1630             m_bodyDisjunction->terms.remove(beginTerm);
1631         else {
1632             while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1633                 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1634                 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1635                 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1636                 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1637             }
1638 
1639             m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1640 
1641             m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1642             m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1643         }
1644     }
1645 
closeBodyAlternative()1646     void closeBodyAlternative()
1647     {
1648         int beginTerm = 0;
1649         int origBeginTerm = 0;
1650         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1651         int endIndex = m_bodyDisjunction->terms.size();
1652 
1653         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1654 
1655         while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1656             beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1657             ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1658             m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1659             m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1660         }
1661 
1662         m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1663 
1664         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1665         m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1666     }
1667 
atomParenthesesSubpatternEnd(unsigned lastSubpatternId,int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType,unsigned callFrameSize=0)1668     void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1669     {
1670         unsigned beginTerm = popParenthesesStack();
1671         closeAlternative(beginTerm + 1);
1672         unsigned endTerm = m_bodyDisjunction->terms.size();
1673 
1674         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1675 
1676         ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1677 
1678         bool capture = parenthesesBegin.capture();
1679         unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1680 
1681         unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1682         ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1683 
1684         parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1685         for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1686             parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1687         parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1688 
1689         m_bodyDisjunction->terms.shrink(beginTerm);
1690 
1691         m_allParenthesesInfo.append(parenthesesDisjunction);
1692         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
1693 
1694         m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1695         m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1696         m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1697     }
1698 
atomParenthesesOnceEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1699     void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1700     {
1701         unsigned beginTerm = popParenthesesStack();
1702         closeAlternative(beginTerm + 1);
1703         unsigned endTerm = m_bodyDisjunction->terms.size();
1704 
1705         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1706 
1707         bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1708         unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1709 
1710         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1711         m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1712         m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1713         m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1714 
1715         m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1716         m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1717         m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1718         m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1719     }
1720 
atomParenthesesTerminalEnd(int inputPosition,unsigned frameLocation,unsigned quantityCount,QuantifierType quantityType)1721     void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1722     {
1723         unsigned beginTerm = popParenthesesStack();
1724         closeAlternative(beginTerm + 1);
1725         unsigned endTerm = m_bodyDisjunction->terms.size();
1726 
1727         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1728 
1729         bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1730         unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1731 
1732         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1733         m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1734         m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1735         m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1736 
1737         m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1738         m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1739         m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1740         m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1741     }
1742 
regexBegin(unsigned numSubpatterns,unsigned callFrameSize,bool onceThrough)1743     void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1744     {
1745         m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1746         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1747         m_bodyDisjunction->terms[0].frameLocation = 0;
1748         m_currentAlternativeIndex = 0;
1749     }
1750 
regexEnd()1751     void regexEnd()
1752     {
1753         closeBodyAlternative();
1754     }
1755 
alternativeBodyDisjunction(bool onceThrough)1756     void alternativeBodyDisjunction(bool onceThrough)
1757     {
1758         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1759         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1760         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1761 
1762         m_currentAlternativeIndex = newAlternativeIndex;
1763     }
1764 
alternativeDisjunction()1765     void alternativeDisjunction()
1766     {
1767         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1768         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1769         m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1770 
1771         m_currentAlternativeIndex = newAlternativeIndex;
1772     }
1773 
emitDisjunction(PatternDisjunction * disjunction,unsigned inputCountAlreadyChecked=0,unsigned parenthesesInputCountAlreadyChecked=0,bool isParentheticalAssertion=false)1774     void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
1775     {
1776         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1777             unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1778 
1779             PatternAlternative* alternative = disjunction->m_alternatives[alt];
1780 
1781             if (alt) {
1782                 if (disjunction == m_pattern.m_body)
1783                     alternativeBodyDisjunction(alternative->onceThrough());
1784                 else
1785                     alternativeDisjunction();
1786             }
1787 
1788             unsigned minimumSize = alternative->m_minimumSize;
1789             int countToCheck;
1790 
1791             if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
1792                 countToCheck = 0;
1793             else
1794                 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1795 
1796             ASSERT(countToCheck >= 0);
1797             if (countToCheck) {
1798                 checkInput(countToCheck);
1799                 currentCountAlreadyChecked += countToCheck;
1800             }
1801 
1802             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1803                 PatternTerm& term = alternative->m_terms[i];
1804 
1805                 switch (term.type) {
1806                 case PatternTerm::TypeAssertionBOL:
1807                     assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1808                     break;
1809 
1810                 case PatternTerm::TypeAssertionEOL:
1811                     assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1812                     break;
1813 
1814                 case PatternTerm::TypeAssertionWordBoundary:
1815                     assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked);
1816                     break;
1817 
1818                 case PatternTerm::TypePatternCharacter:
1819                     atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1820                     break;
1821 
1822                 case PatternTerm::TypeCharacterClass:
1823                     atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1824                     break;
1825 
1826                 case PatternTerm::TypeBackReference:
1827                     atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1828                         break;
1829 
1830                 case PatternTerm::TypeForwardReference:
1831                     break;
1832 
1833                 case PatternTerm::TypeParenthesesSubpattern: {
1834                     unsigned disjunctionAlreadyCheckedCount = 0;
1835                     if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1836                         unsigned alternativeFrameLocation = term.frameLocation;
1837                         // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1838                         if (term.quantityType == QuantifierFixedCount)
1839                             disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1840                         else
1841                             alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1842                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1843                         atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
1844                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1845                         atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1846                     } else if (term.parentheses.isTerminal) {
1847                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1848                         atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1849                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1850                         atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1851                     } else {
1852                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1853                         atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1854                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1855                         atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1856                     }
1857                     break;
1858                 }
1859 
1860                 case PatternTerm::TypeParentheticalAssertion: {
1861                     unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1862 
1863                     ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1864                     int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
1865                     int uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1866 
1867                     if (uncheckAmount > 0) {
1868                         uncheckInput(uncheckAmount);
1869                         currentCountAlreadyChecked -= uncheckAmount;
1870                     } else
1871                         uncheckAmount = 0;
1872 
1873                     atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1874                     emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
1875                     atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1876                     if (uncheckAmount) {
1877                         checkInput(uncheckAmount);
1878                         currentCountAlreadyChecked += uncheckAmount;
1879                     }
1880                     break;
1881                 }
1882                 }
1883             }
1884         }
1885     }
1886 
1887 private:
1888     YarrPattern& m_pattern;
1889     OwnPtr<ByteDisjunction> m_bodyDisjunction;
1890     unsigned m_currentAlternativeIndex;
1891     Vector<ParenthesesStackEntry> m_parenthesesStack;
1892     Vector<ByteDisjunction*> m_allParenthesesInfo;
1893 };
1894 
byteCompile(YarrPattern & pattern,BumpPointerAllocator * allocator)1895 PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1896 {
1897     return ByteCompiler(pattern).compile(allocator);
1898 }
1899 
interpret(BytecodePattern * bytecode,const UChar * input,unsigned start,unsigned length,int * output)1900 int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output)
1901 {
1902     return Interpreter(bytecode, output, input, start, length).interpret();
1903 }
1904 
1905 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1906 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1907 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1908 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1909 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1910 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1911 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
1912 
1913 
1914 } }
1915