• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "config.h"
27 #include "CharacterClassConstructor.h"
28 
29 #if ENABLE(WREC)
30 
31 #include "pcre_internal.h"
32 #include <wtf/ASCIICType.h>
33 
34 using namespace WTF;
35 
36 namespace JSC { namespace WREC {
37 
addSorted(Vector<UChar> & matches,UChar ch)38 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
39 {
40     unsigned pos = 0;
41     unsigned range = matches.size();
42 
43     // binary chop, find position to insert char.
44     while (range) {
45         unsigned index = range >> 1;
46 
47         int val = matches[pos+index] - ch;
48         if (!val)
49             return;
50         else if (val > 0)
51             range = index;
52         else {
53             pos += (index+1);
54             range -= (index+1);
55         }
56     }
57 
58     if (pos == matches.size())
59         matches.append(ch);
60     else
61         matches.insert(pos, ch);
62 }
63 
addSortedRange(Vector<CharacterRange> & ranges,UChar lo,UChar hi)64 void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
65 {
66     unsigned end = ranges.size();
67 
68     // Simple linear scan - I doubt there are that many ranges anyway...
69     // feel free to fix this with something faster (eg binary chop).
70     for (unsigned i = 0; i < end; ++i) {
71         // does the new range fall before the current position in the array
72         if (hi < ranges[i].begin) {
73             // optional optimization: concatenate appending ranges? - may not be worthwhile.
74             if (hi == (ranges[i].begin - 1)) {
75                 ranges[i].begin = lo;
76                 return;
77             }
78             CharacterRange r = {lo, hi};
79             ranges.insert(i, r);
80             return;
81         }
82         // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
83         // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
84         // end of the last range they concatenate, which is just as good.
85         if (lo <= (ranges[i].end + 1)) {
86             // found an intersect! we'll replace this entry in the array.
87             ranges[i].begin = std::min(ranges[i].begin, lo);
88             ranges[i].end = std::max(ranges[i].end, hi);
89 
90             // now check if the new range can subsume any subsequent ranges.
91             unsigned next = i+1;
92             // each iteration of the loop we will either remove something from the list, or break the loop.
93             while (next < ranges.size()) {
94                 if (ranges[next].begin <= (ranges[i].end + 1)) {
95                     // the next entry now overlaps / concatenates this one.
96                     ranges[i].end = std::max(ranges[i].end, ranges[next].end);
97                     ranges.remove(next);
98                 } else
99                     break;
100             }
101 
102             return;
103         }
104     }
105 
106     // CharacterRange comes after all existing ranges.
107     CharacterRange r = {lo, hi};
108     ranges.append(r);
109 }
110 
put(UChar ch)111 void CharacterClassConstructor::put(UChar ch)
112 {
113     // Parsing a regular expression like [a-z], we start in an initial empty state:
114     //     ((m_charBuffer == -1) && !m_isPendingDash)
115     // When buffer the 'a' sice it may be (and is in this case) part of a range:
116     //     ((m_charBuffer != -1) && !m_isPendingDash)
117     // Having parsed the hyphen we then record that the dash is also pending:
118     //     ((m_charBuffer != -1) && m_isPendingDash)
119     // The next change will always take us back to the initial state - either because
120     // a complete range has been parsed (such as [a-z]), or because a flush is forced,
121     // due to an early end in the regexp ([a-]), or a character class escape being added
122     // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
123     ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
124 
125     if (m_charBuffer != -1) {
126         if (m_isPendingDash) {
127             // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
128             UChar lo = m_charBuffer;
129             UChar hi = ch;
130             // Reset back to the inital state.
131             m_charBuffer = -1;
132             m_isPendingDash = false;
133 
134             // This is an error, detected lazily.  Do not proceed.
135             if (lo > hi) {
136                 m_isUpsideDown = true;
137                 return;
138             }
139 
140             if (lo <= 0x7f) {
141                 char asciiLo = lo;
142                 char asciiHi = std::min(hi, (UChar)0x7f);
143                 addSortedRange(m_ranges, lo, asciiHi);
144 
145                 if (m_isCaseInsensitive) {
146                     if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
147                         addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
148                     if ((asciiLo <= 'z') && (asciiHi >= 'a'))
149                         addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
150                 }
151             }
152             if (hi >= 0x80) {
153                 UChar unicodeCurr = std::max(lo, (UChar)0x80);
154                 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
155 
156                 if (m_isCaseInsensitive) {
157                     // we're going to scan along, updating the start of the range
158                     while (unicodeCurr <= hi) {
159                         // Spin forwards over any characters that don't have two cases.
160                         for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
161                             // if this was the last character in the range, we're done.
162                             if (unicodeCurr == hi)
163                                 return;
164                         }
165                         // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
166                         UChar rangeStart = unicodeCurr;
167                         UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
168 
169                         // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
170                         // and if it's other case value is one greater then the othercase value for the current last
171                         // character included in the range, we can include next into the range.
172                         while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
173                             // increment unicodeCurr; it points to the end of the range.
174                             // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
175                             ++unicodeCurr;
176                             ++otherCurr;
177                         }
178 
179                         // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
180                         addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
181 
182                         // unicodeCurr has been added, move on to the next char.
183                         ++unicodeCurr;
184                     }
185                 }
186             }
187         } else if (ch == '-')
188             // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
189             m_isPendingDash = true;
190         else {
191             // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
192             flush();
193             m_charBuffer = ch;
194         }
195     } else
196         // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
197         // (the hyphen not treated as a special character in this case, same handling for any char).
198         m_charBuffer = ch;
199 }
200 
201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
203 // array (either ascii or unicode).
204 // If the pattern is case insensitive we add entries for both cases.
flush()205 void CharacterClassConstructor::flush()
206 {
207     if (m_charBuffer != -1) {
208         if (m_charBuffer <= 0x7f) {
209             if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
210                 addSorted(m_matches, toASCIIUpper(m_charBuffer));
211             addSorted(m_matches, m_charBuffer);
212             if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
213                 addSorted(m_matches, toASCIILower(m_charBuffer));
214         } else {
215             addSorted(m_matchesUnicode, m_charBuffer);
216             if (m_isCaseInsensitive) {
217                 int other = jsc_pcre_ucp_othercase(m_charBuffer);
218                 if (other != -1)
219                     addSorted(m_matchesUnicode, other);
220             }
221         }
222         m_charBuffer = -1;
223     }
224 
225     if (m_isPendingDash) {
226         addSorted(m_matches, '-');
227         m_isPendingDash = false;
228     }
229 }
230 
append(const CharacterClass & other)231 void CharacterClassConstructor::append(const CharacterClass& other)
232 {
233     // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
234     // Need to check the spec, really, but think this matches PCRE behaviour.
235     flush();
236 
237     if (other.numMatches) {
238         for (size_t i = 0; i < other.numMatches; ++i)
239             addSorted(m_matches, other.matches[i]);
240     }
241     if (other.numRanges) {
242         for (size_t i = 0; i < other.numRanges; ++i)
243             addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
244     }
245     if (other.numMatchesUnicode) {
246         for (size_t i = 0; i < other.numMatchesUnicode; ++i)
247             addSorted(m_matchesUnicode, other.matchesUnicode[i]);
248     }
249     if (other.numRangesUnicode) {
250         for (size_t i = 0; i < other.numRangesUnicode; ++i)
251             addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
252     }
253 }
254 
255 } } // namespace JSC::WREC
256 
257 #endif // ENABLE(WREC)
258