1 /*
2 * Copyright (C) 2008 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "WRECParser.h"
28
29 #if ENABLE(WREC)
30
31 #include "CharacterClassConstructor.h"
32 #include "WRECFunctors.h"
33
34 using namespace WTF;
35
36 namespace JSC { namespace WREC {
37
38 // These error messages match the error messages used by PCRE.
39 const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
40 const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
41 const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
42 const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
43 const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
44 const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
45 const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
46 const char* Parser::EscapeUnterminated = "\\ at end of pattern";
47
48 class PatternCharacterSequence {
49 typedef Generator::JumpList JumpList;
50
51 public:
PatternCharacterSequence(Generator & generator,JumpList & failures)52 PatternCharacterSequence(Generator& generator, JumpList& failures)
53 : m_generator(generator)
54 , m_failures(failures)
55 {
56 }
57
size()58 size_t size() { return m_sequence.size(); }
59
append(int ch)60 void append(int ch)
61 {
62 m_sequence.append(ch);
63 }
64
flush()65 void flush()
66 {
67 if (!m_sequence.size())
68 return;
69
70 m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
71 m_sequence.clear();
72 }
73
flush(const Quantifier & quantifier)74 void flush(const Quantifier& quantifier)
75 {
76 if (!m_sequence.size())
77 return;
78
79 m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);
80
81 switch (quantifier.type) {
82 case Quantifier::None:
83 case Quantifier::Error:
84 ASSERT_NOT_REACHED();
85 break;
86
87 case Quantifier::Greedy: {
88 GeneratePatternCharacterFunctor functor(m_sequence.last());
89 m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
90 break;
91 }
92
93 case Quantifier::NonGreedy: {
94 GeneratePatternCharacterFunctor functor(m_sequence.last());
95 m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
96 break;
97 }
98 }
99
100 m_sequence.clear();
101 }
102
103 private:
104 Generator& m_generator;
105 JumpList& m_failures;
106 Vector<int, 8> m_sequence;
107 };
108
consumeGreedyQuantifier()109 ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
110 {
111 switch (peek()) {
112 case '?':
113 consume();
114 return Quantifier(Quantifier::Greedy, 0, 1);
115
116 case '*':
117 consume();
118 return Quantifier(Quantifier::Greedy, 0);
119
120 case '+':
121 consume();
122 return Quantifier(Quantifier::Greedy, 1);
123
124 case '{': {
125 SavedState state(*this);
126 consume();
127
128 // Accept: {n}, {n,}, {n,m}.
129 // Reject: {n,m} where n > m.
130 // Ignore: Anything else, such as {n, m}.
131
132 if (!peekIsDigit()) {
133 state.restore();
134 return Quantifier();
135 }
136
137 unsigned min = consumeNumber();
138 unsigned max = min;
139
140 if (peek() == ',') {
141 consume();
142 max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
143 }
144
145 if (peek() != '}') {
146 state.restore();
147 return Quantifier();
148 }
149 consume();
150
151 if (min > max) {
152 setError(QuantifierOutOfOrder);
153 return Quantifier(Quantifier::Error);
154 }
155
156 return Quantifier(Quantifier::Greedy, min, max);
157 }
158
159 default:
160 return Quantifier(); // No quantifier.
161 }
162 }
163
consumeQuantifier()164 Quantifier Parser::consumeQuantifier()
165 {
166 Quantifier q = consumeGreedyQuantifier();
167
168 if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
169 consume();
170 q.type = Quantifier::NonGreedy;
171 }
172
173 return q;
174 }
175
parseCharacterClassQuantifier(JumpList & failures,const CharacterClass & charClass,bool invert)176 bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
177 {
178 Quantifier q = consumeQuantifier();
179
180 switch (q.type) {
181 case Quantifier::None: {
182 m_generator.generateCharacterClass(failures, charClass, invert);
183 break;
184 }
185
186 case Quantifier::Greedy: {
187 GenerateCharacterClassFunctor functor(&charClass, invert);
188 m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
189 break;
190 }
191
192 case Quantifier::NonGreedy: {
193 GenerateCharacterClassFunctor functor(&charClass, invert);
194 m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
195 break;
196 }
197
198 case Quantifier::Error:
199 return false;
200 }
201
202 return true;
203 }
204
parseBackreferenceQuantifier(JumpList & failures,unsigned subpatternId)205 bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
206 {
207 Quantifier q = consumeQuantifier();
208
209 switch (q.type) {
210 case Quantifier::None: {
211 m_generator.generateBackreference(failures, subpatternId);
212 break;
213 }
214
215 case Quantifier::Greedy:
216 case Quantifier::NonGreedy:
217 m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
218 return true;
219
220 case Quantifier::Error:
221 return false;
222 }
223
224 return true;
225 }
226
parseParentheses(JumpList & failures)227 bool Parser::parseParentheses(JumpList& failures)
228 {
229 ParenthesesType type = consumeParenthesesType();
230
231 // FIXME: WREC originally failed to backtrack correctly in cases such as
232 // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
233 // unsupported parentheses, we fall back on PCRE.
234
235 switch (type) {
236 case Generator::Assertion: {
237 m_generator.generateParenthesesAssertion(failures);
238
239 if (consume() != ')') {
240 setError(ParenthesesUnmatched);
241 return false;
242 }
243
244 Quantifier quantifier = consumeQuantifier();
245 if (quantifier.type != Quantifier::None && quantifier.min == 0) {
246 setError(ParenthesesNotSupported);
247 return false;
248 }
249
250 return true;
251 }
252 case Generator::InvertedAssertion: {
253 m_generator.generateParenthesesInvertedAssertion(failures);
254
255 if (consume() != ')') {
256 setError(ParenthesesUnmatched);
257 return false;
258 }
259
260 Quantifier quantifier = consumeQuantifier();
261 if (quantifier.type != Quantifier::None && quantifier.min == 0) {
262 setError(ParenthesesNotSupported);
263 return false;
264 }
265
266 return true;
267 }
268 default:
269 setError(ParenthesesNotSupported);
270 return false;
271 }
272 }
273
parseCharacterClass(JumpList & failures)274 bool Parser::parseCharacterClass(JumpList& failures)
275 {
276 bool invert = false;
277 if (peek() == '^') {
278 consume();
279 invert = true;
280 }
281
282 CharacterClassConstructor constructor(m_ignoreCase);
283
284 int ch;
285 while ((ch = peek()) != ']') {
286 switch (ch) {
287 case EndOfPattern:
288 setError(CharacterClassUnmatched);
289 return false;
290
291 case '\\': {
292 consume();
293 Escape escape = consumeEscape(true);
294
295 switch (escape.type()) {
296 case Escape::PatternCharacter: {
297 int character = PatternCharacterEscape::cast(escape).character();
298 if (character == '-')
299 constructor.flushBeforeEscapedHyphen();
300 constructor.put(character);
301 break;
302 }
303 case Escape::CharacterClass: {
304 const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
305 ASSERT(!characterClassEscape.invert());
306 constructor.append(characterClassEscape.characterClass());
307 break;
308 }
309 case Escape::Error:
310 return false;
311 case Escape::Backreference:
312 case Escape::WordBoundaryAssertion: {
313 ASSERT_NOT_REACHED();
314 break;
315 }
316 }
317 break;
318 }
319
320 default:
321 consume();
322 constructor.put(ch);
323 }
324 }
325 consume();
326
327 // lazily catch reversed ranges ([z-a])in character classes
328 if (constructor.isUpsideDown()) {
329 setError(CharacterClassOutOfOrder);
330 return false;
331 }
332
333 constructor.flush();
334 CharacterClass charClass = constructor.charClass();
335 return parseCharacterClassQuantifier(failures, charClass, invert);
336 }
337
parseNonCharacterEscape(JumpList & failures,const Escape & escape)338 bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
339 {
340 switch (escape.type()) {
341 case Escape::PatternCharacter:
342 ASSERT_NOT_REACHED();
343 return false;
344
345 case Escape::CharacterClass:
346 return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());
347
348 case Escape::Backreference:
349 return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());
350
351 case Escape::WordBoundaryAssertion:
352 m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
353 return true;
354
355 case Escape::Error:
356 return false;
357 }
358
359 ASSERT_NOT_REACHED();
360 return false;
361 }
362
consumeEscape(bool inCharacterClass)363 Escape Parser::consumeEscape(bool inCharacterClass)
364 {
365 switch (peek()) {
366 case EndOfPattern:
367 setError(EscapeUnterminated);
368 return Escape(Escape::Error);
369
370 // Assertions
371 case 'b':
372 consume();
373 if (inCharacterClass)
374 return PatternCharacterEscape('\b');
375 return WordBoundaryAssertionEscape(false); // do not invert
376 case 'B':
377 consume();
378 if (inCharacterClass)
379 return PatternCharacterEscape('B');
380 return WordBoundaryAssertionEscape(true); // invert
381
382 // CharacterClassEscape
383 case 'd':
384 consume();
385 return CharacterClassEscape(CharacterClass::digits(), false);
386 case 's':
387 consume();
388 return CharacterClassEscape(CharacterClass::spaces(), false);
389 case 'w':
390 consume();
391 return CharacterClassEscape(CharacterClass::wordchar(), false);
392 case 'D':
393 consume();
394 return inCharacterClass
395 ? CharacterClassEscape(CharacterClass::nondigits(), false)
396 : CharacterClassEscape(CharacterClass::digits(), true);
397 case 'S':
398 consume();
399 return inCharacterClass
400 ? CharacterClassEscape(CharacterClass::nonspaces(), false)
401 : CharacterClassEscape(CharacterClass::spaces(), true);
402 case 'W':
403 consume();
404 return inCharacterClass
405 ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
406 : CharacterClassEscape(CharacterClass::wordchar(), true);
407
408 // DecimalEscape
409 case '1':
410 case '2':
411 case '3':
412 case '4':
413 case '5':
414 case '6':
415 case '7':
416 case '8':
417 case '9': {
418 if (peekDigit() > m_numSubpatterns || inCharacterClass) {
419 // To match Firefox, we parse an invalid backreference in the range [1-7]
420 // as an octal escape.
421 return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
422 }
423
424 int value = 0;
425 do {
426 unsigned newValue = value * 10 + peekDigit();
427 if (newValue > m_numSubpatterns)
428 break;
429 value = newValue;
430 consume();
431 } while (peekIsDigit());
432
433 return BackreferenceEscape(value);
434 }
435
436 // Octal escape
437 case '0':
438 consume();
439 return PatternCharacterEscape(consumeOctal());
440
441 // ControlEscape
442 case 'f':
443 consume();
444 return PatternCharacterEscape('\f');
445 case 'n':
446 consume();
447 return PatternCharacterEscape('\n');
448 case 'r':
449 consume();
450 return PatternCharacterEscape('\r');
451 case 't':
452 consume();
453 return PatternCharacterEscape('\t');
454 case 'v':
455 consume();
456 return PatternCharacterEscape('\v');
457
458 // ControlLetter
459 case 'c': {
460 SavedState state(*this);
461 consume();
462
463 int control = consume();
464 // To match Firefox, inside a character class, we also accept numbers
465 // and '_' as control characters.
466 if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
467 state.restore();
468 return PatternCharacterEscape('\\');
469 }
470 return PatternCharacterEscape(control & 31);
471 }
472
473 // HexEscape
474 case 'x': {
475 consume();
476
477 SavedState state(*this);
478 int x = consumeHex(2);
479 if (x == -1) {
480 state.restore();
481 return PatternCharacterEscape('x');
482 }
483 return PatternCharacterEscape(x);
484 }
485
486 // UnicodeEscape
487 case 'u': {
488 consume();
489
490 SavedState state(*this);
491 int x = consumeHex(4);
492 if (x == -1) {
493 state.restore();
494 return PatternCharacterEscape('u');
495 }
496 return PatternCharacterEscape(x);
497 }
498
499 // IdentityEscape
500 default:
501 return PatternCharacterEscape(consume());
502 }
503 }
504
parseAlternative(JumpList & failures)505 void Parser::parseAlternative(JumpList& failures)
506 {
507 PatternCharacterSequence sequence(m_generator, failures);
508
509 while (1) {
510 switch (peek()) {
511 case EndOfPattern:
512 case '|':
513 case ')':
514 sequence.flush();
515 return;
516
517 case '*':
518 case '+':
519 case '?':
520 case '{': {
521 Quantifier q = consumeQuantifier();
522
523 if (q.type == Quantifier::None) {
524 sequence.append(consume());
525 continue;
526 }
527
528 if (q.type == Quantifier::Error)
529 return;
530
531 if (!sequence.size()) {
532 setError(QuantifierWithoutAtom);
533 return;
534 }
535
536 sequence.flush(q);
537 continue;
538 }
539
540 case '^':
541 consume();
542
543 sequence.flush();
544 m_generator.generateAssertionBOL(failures);
545 continue;
546
547 case '$':
548 consume();
549
550 sequence.flush();
551 m_generator.generateAssertionEOL(failures);
552 continue;
553
554 case '.':
555 consume();
556
557 sequence.flush();
558 if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
559 return;
560 continue;
561
562 case '[':
563 consume();
564
565 sequence.flush();
566 if (!parseCharacterClass(failures))
567 return;
568 continue;
569
570 case '(':
571 consume();
572
573 sequence.flush();
574 if (!parseParentheses(failures))
575 return;
576 continue;
577
578 case '\\': {
579 consume();
580
581 Escape escape = consumeEscape(false);
582 if (escape.type() == Escape::PatternCharacter) {
583 sequence.append(PatternCharacterEscape::cast(escape).character());
584 continue;
585 }
586
587 sequence.flush();
588 if (!parseNonCharacterEscape(failures, escape))
589 return;
590 continue;
591 }
592
593 default:
594 sequence.append(consume());
595 continue;
596 }
597 }
598 }
599
600 /*
601 TOS holds index.
602 */
parseDisjunction(JumpList & failures)603 void Parser::parseDisjunction(JumpList& failures)
604 {
605 parseAlternative(failures);
606 if (peek() != '|')
607 return;
608
609 JumpList successes;
610 do {
611 consume();
612 m_generator.terminateAlternative(successes, failures);
613 parseAlternative(failures);
614 } while (peek() == '|');
615
616 m_generator.terminateDisjunction(successes);
617 }
618
consumeParenthesesType()619 Generator::ParenthesesType Parser::consumeParenthesesType()
620 {
621 if (peek() != '?')
622 return Generator::Capturing;
623 consume();
624
625 switch (consume()) {
626 case ':':
627 return Generator::NonCapturing;
628
629 case '=':
630 return Generator::Assertion;
631
632 case '!':
633 return Generator::InvertedAssertion;
634
635 default:
636 setError(ParenthesesTypeInvalid);
637 return Generator::Error;
638 }
639 }
640
641 } } // namespace JSC::WREC
642
643 #endif // ENABLE(WREC)
644