1 /*
2 * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "CharacterClassConstructor.h"
28
29 #if ENABLE(WREC)
30
31 #include "pcre_internal.h"
32 #include <wtf/ASCIICType.h>
33
34 using namespace WTF;
35
36 namespace JSC { namespace WREC {
37
addSorted(Vector<UChar> & matches,UChar ch)38 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
39 {
40 unsigned pos = 0;
41 unsigned range = matches.size();
42
43 // binary chop, find position to insert char.
44 while (range) {
45 unsigned index = range >> 1;
46
47 int val = matches[pos+index] - ch;
48 if (!val)
49 return;
50 else if (val > 0)
51 range = index;
52 else {
53 pos += (index+1);
54 range -= (index+1);
55 }
56 }
57
58 if (pos == matches.size())
59 matches.append(ch);
60 else
61 matches.insert(pos, ch);
62 }
63
addSortedRange(Vector<CharacterRange> & ranges,UChar lo,UChar hi)64 void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
65 {
66 unsigned end = ranges.size();
67
68 // Simple linear scan - I doubt there are that many ranges anyway...
69 // feel free to fix this with something faster (eg binary chop).
70 for (unsigned i = 0; i < end; ++i) {
71 // does the new range fall before the current position in the array
72 if (hi < ranges[i].begin) {
73 // optional optimization: concatenate appending ranges? - may not be worthwhile.
74 if (hi == (ranges[i].begin - 1)) {
75 ranges[i].begin = lo;
76 return;
77 }
78 CharacterRange r = {lo, hi};
79 ranges.insert(i, r);
80 return;
81 }
82 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
83 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
84 // end of the last range they concatenate, which is just as good.
85 if (lo <= (ranges[i].end + 1)) {
86 // found an intersect! we'll replace this entry in the array.
87 ranges[i].begin = std::min(ranges[i].begin, lo);
88 ranges[i].end = std::max(ranges[i].end, hi);
89
90 // now check if the new range can subsume any subsequent ranges.
91 unsigned next = i+1;
92 // each iteration of the loop we will either remove something from the list, or break the loop.
93 while (next < ranges.size()) {
94 if (ranges[next].begin <= (ranges[i].end + 1)) {
95 // the next entry now overlaps / concatenates this one.
96 ranges[i].end = std::max(ranges[i].end, ranges[next].end);
97 ranges.remove(next);
98 } else
99 break;
100 }
101
102 return;
103 }
104 }
105
106 // CharacterRange comes after all existing ranges.
107 CharacterRange r = {lo, hi};
108 ranges.append(r);
109 }
110
put(UChar ch)111 void CharacterClassConstructor::put(UChar ch)
112 {
113 // Parsing a regular expression like [a-z], we start in an initial empty state:
114 // ((m_charBuffer == -1) && !m_isPendingDash)
115 // When buffer the 'a' sice it may be (and is in this case) part of a range:
116 // ((m_charBuffer != -1) && !m_isPendingDash)
117 // Having parsed the hyphen we then record that the dash is also pending:
118 // ((m_charBuffer != -1) && m_isPendingDash)
119 // The next change will always take us back to the initial state - either because
120 // a complete range has been parsed (such as [a-z]), or because a flush is forced,
121 // due to an early end in the regexp ([a-]), or a character class escape being added
122 // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
123 ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
124
125 if (m_charBuffer != -1) {
126 if (m_isPendingDash) {
127 // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
128 UChar lo = m_charBuffer;
129 UChar hi = ch;
130 // Reset back to the inital state.
131 m_charBuffer = -1;
132 m_isPendingDash = false;
133
134 // This is an error, detected lazily. Do not proceed.
135 if (lo > hi) {
136 m_isUpsideDown = true;
137 return;
138 }
139
140 if (lo <= 0x7f) {
141 char asciiLo = lo;
142 char asciiHi = std::min(hi, (UChar)0x7f);
143 addSortedRange(m_ranges, lo, asciiHi);
144
145 if (m_isCaseInsensitive) {
146 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
147 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
148 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
149 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
150 }
151 }
152 if (hi >= 0x80) {
153 UChar unicodeCurr = std::max(lo, (UChar)0x80);
154 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
155
156 if (m_isCaseInsensitive) {
157 // we're going to scan along, updating the start of the range
158 while (unicodeCurr <= hi) {
159 // Spin forwards over any characters that don't have two cases.
160 for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
161 // if this was the last character in the range, we're done.
162 if (unicodeCurr == hi)
163 return;
164 }
165 // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
166 UChar rangeStart = unicodeCurr;
167 UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
168
169 // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,
170 // and if it's other case value is one greater then the othercase value for the current last
171 // character included in the range, we can include next into the range.
172 while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
173 // increment unicodeCurr; it points to the end of the range.
174 // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
175 ++unicodeCurr;
176 ++otherCurr;
177 }
178
179 // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
180 addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
181
182 // unicodeCurr has been added, move on to the next char.
183 ++unicodeCurr;
184 }
185 }
186 }
187 } else if (ch == '-')
188 // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
189 m_isPendingDash = true;
190 else {
191 // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
192 flush();
193 m_charBuffer = ch;
194 }
195 } else
196 // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
197 // (the hyphen not treated as a special character in this case, same handling for any char).
198 m_charBuffer = ch;
199 }
200
201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
203 // array (either ascii or unicode).
204 // If the pattern is case insensitive we add entries for both cases.
flush()205 void CharacterClassConstructor::flush()
206 {
207 if (m_charBuffer != -1) {
208 if (m_charBuffer <= 0x7f) {
209 if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
210 addSorted(m_matches, toASCIIUpper(m_charBuffer));
211 addSorted(m_matches, m_charBuffer);
212 if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
213 addSorted(m_matches, toASCIILower(m_charBuffer));
214 } else {
215 addSorted(m_matchesUnicode, m_charBuffer);
216 if (m_isCaseInsensitive) {
217 int other = jsc_pcre_ucp_othercase(m_charBuffer);
218 if (other != -1)
219 addSorted(m_matchesUnicode, other);
220 }
221 }
222 m_charBuffer = -1;
223 }
224
225 if (m_isPendingDash) {
226 addSorted(m_matches, '-');
227 m_isPendingDash = false;
228 }
229 }
230
append(const CharacterClass & other)231 void CharacterClassConstructor::append(const CharacterClass& other)
232 {
233 // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
234 // Need to check the spec, really, but think this matches PCRE behaviour.
235 flush();
236
237 if (other.numMatches) {
238 for (size_t i = 0; i < other.numMatches; ++i)
239 addSorted(m_matches, other.matches[i]);
240 }
241 if (other.numRanges) {
242 for (size_t i = 0; i < other.numRanges; ++i)
243 addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
244 }
245 if (other.numMatchesUnicode) {
246 for (size_t i = 0; i < other.numMatchesUnicode; ++i)
247 addSorted(m_matchesUnicode, other.matchesUnicode[i]);
248 }
249 if (other.numRangesUnicode) {
250 for (size_t i = 0; i < other.numRangesUnicode; ++i)
251 addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
252 }
253 }
254
255 } } // namespace JSC::WREC
256
257 #endif // ENABLE(WREC)
258