• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #ifndef InputStreamPreprocessor_h
29 #define InputStreamPreprocessor_h
30 
31 #include "platform/text/SegmentedString.h"
32 #include "wtf/Noncopyable.h"
33 
34 namespace WebCore {
35 
36 const LChar kEndOfFileMarker = 0;
37 
38 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
39 template <typename Tokenizer>
40 class InputStreamPreprocessor {
41     WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
42 public:
InputStreamPreprocessor(Tokenizer * tokenizer)43     InputStreamPreprocessor(Tokenizer* tokenizer)
44         : m_tokenizer(tokenizer)
45     {
46         reset();
47     }
48 
nextInputCharacter()49     ALWAYS_INLINE UChar nextInputCharacter() const { return m_nextInputCharacter; }
50 
51     // Returns whether we succeeded in peeking at the next character.
52     // The only way we can fail to peek is if there are no more
53     // characters in |source| (after collapsing \r\n, etc).
peek(SegmentedString & source)54     ALWAYS_INLINE bool peek(SegmentedString& source)
55     {
56         m_nextInputCharacter = source.currentChar();
57 
58         // Every branch in this function is expensive, so we have a
59         // fast-reject branch for characters that don't require special
60         // handling. Please run the parser benchmark whenever you touch
61         // this function. It's very hot.
62         static const UChar specialCharacterMask = '\n' | '\r' | '\0';
63         if (m_nextInputCharacter & ~specialCharacterMask) {
64             m_skipNextNewLine = false;
65             return true;
66         }
67         return processNextInputCharacter(source);
68     }
69 
70     // Returns whether there are more characters in |source| after advancing.
advance(SegmentedString & source)71     ALWAYS_INLINE bool advance(SegmentedString& source)
72     {
73         source.advanceAndUpdateLineNumber();
74         if (source.isEmpty())
75             return false;
76         return peek(source);
77     }
78 
skipNextNewLine()79     bool skipNextNewLine() const { return m_skipNextNewLine; }
80 
81     void reset(bool skipNextNewLine = false)
82     {
83         m_nextInputCharacter = '\0';
84         m_skipNextNewLine = skipNextNewLine;
85     }
86 
87 private:
processNextInputCharacter(SegmentedString & source)88     bool processNextInputCharacter(SegmentedString& source)
89     {
90     ProcessAgain:
91         ASSERT(m_nextInputCharacter == source.currentChar());
92 
93         if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
94             m_skipNextNewLine = false;
95             source.advancePastNewlineAndUpdateLineNumber();
96             if (source.isEmpty())
97                 return false;
98             m_nextInputCharacter = source.currentChar();
99         }
100         if (m_nextInputCharacter == '\r') {
101             m_nextInputCharacter = '\n';
102             m_skipNextNewLine = true;
103         } else {
104             m_skipNextNewLine = false;
105             // FIXME: The spec indicates that the surrogate pair range as well as
106             // a number of specific character values are parse errors and should be replaced
107             // by the replacement character. We suspect this is a problem with the spec as doing
108             // that filtering breaks surrogate pair handling and causes us not to match Minefield.
109             if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
110                 if (m_tokenizer->shouldSkipNullCharacters()) {
111                     source.advancePastNonNewline();
112                     if (source.isEmpty())
113                         return false;
114                     m_nextInputCharacter = source.currentChar();
115                     goto ProcessAgain;
116                 }
117                 m_nextInputCharacter = 0xFFFD;
118             }
119         }
120         return true;
121     }
122 
shouldTreatNullAsEndOfFileMarker(SegmentedString & source)123     bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
124     {
125         return source.isClosed() && source.length() == 1;
126     }
127 
128     Tokenizer* m_tokenizer;
129 
130     // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
131     UChar m_nextInputCharacter;
132     bool m_skipNextNewLine;
133 };
134 
135 }
136 
137 #endif // InputStreamPreprocessor_h
138 
139