• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
26 
27 #ifndef HTMLTokenizer_h
28 #define HTMLTokenizer_h
29 
30 #include "SegmentedString.h"
31 #include <wtf/Noncopyable.h>
32 #include <wtf/PassOwnPtr.h>
33 #include <wtf/Vector.h>
34 #include <wtf/text/AtomicString.h>
35 
36 namespace WebCore {
37 
38 class Element;
39 class Frame;
40 class HTMLToken;
41 
42 class HTMLTokenizer {
43     WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED;
44 public:
45     enum State {
46         DataState,
47         CharacterReferenceInDataState,
48         RCDATAState,
49         CharacterReferenceInRCDATAState,
50         RAWTEXTState,
51         ScriptDataState,
52         PLAINTEXTState,
53         TagOpenState,
54         EndTagOpenState,
55         TagNameState,
56         RCDATALessThanSignState,
57         RCDATAEndTagOpenState,
58         RCDATAEndTagNameState,
59         RAWTEXTLessThanSignState,
60         RAWTEXTEndTagOpenState,
61         RAWTEXTEndTagNameState,
62         ScriptDataLessThanSignState,
63         ScriptDataEndTagOpenState,
64         ScriptDataEndTagNameState,
65         ScriptDataEscapeStartState,
66         ScriptDataEscapeStartDashState,
67         ScriptDataEscapedState,
68         ScriptDataEscapedDashState,
69         ScriptDataEscapedDashDashState,
70         ScriptDataEscapedLessThanSignState,
71         ScriptDataEscapedEndTagOpenState,
72         ScriptDataEscapedEndTagNameState,
73         ScriptDataDoubleEscapeStartState,
74         ScriptDataDoubleEscapedState,
75         ScriptDataDoubleEscapedDashState,
76         ScriptDataDoubleEscapedDashDashState,
77         ScriptDataDoubleEscapedLessThanSignState,
78         ScriptDataDoubleEscapeEndState,
79         BeforeAttributeNameState,
80         AttributeNameState,
81         AfterAttributeNameState,
82         BeforeAttributeValueState,
83         AttributeValueDoubleQuotedState,
84         AttributeValueSingleQuotedState,
85         AttributeValueUnquotedState,
86         CharacterReferenceInAttributeValueState,
87         AfterAttributeValueQuotedState,
88         SelfClosingStartTagState,
89         BogusCommentState,
90         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
91         // it internally to keep track of whether we've started the bogus
92         // comment token yet.
93         ContinueBogusCommentState,
94         MarkupDeclarationOpenState,
95         CommentStartState,
96         CommentStartDashState,
97         CommentState,
98         CommentEndDashState,
99         CommentEndState,
100         CommentEndBangState,
101         DOCTYPEState,
102         BeforeDOCTYPENameState,
103         DOCTYPENameState,
104         AfterDOCTYPENameState,
105         AfterDOCTYPEPublicKeywordState,
106         BeforeDOCTYPEPublicIdentifierState,
107         DOCTYPEPublicIdentifierDoubleQuotedState,
108         DOCTYPEPublicIdentifierSingleQuotedState,
109         AfterDOCTYPEPublicIdentifierState,
110         BetweenDOCTYPEPublicAndSystemIdentifiersState,
111         AfterDOCTYPESystemKeywordState,
112         BeforeDOCTYPESystemIdentifierState,
113         DOCTYPESystemIdentifierDoubleQuotedState,
114         DOCTYPESystemIdentifierSingleQuotedState,
115         AfterDOCTYPESystemIdentifierState,
116         BogusDOCTYPEState,
117         CDATASectionState,
118         // These CDATA states are not in the HTML5 spec, but we use them internally.
119         CDATASectionRightSquareBracketState,
120         CDATASectionDoubleRightSquareBracketState,
121     };
122 
create(bool usePreHTML5ParserQuirks)123     static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
124     ~HTMLTokenizer();
125 
126     void reset();
127 
128     // This function returns true if it emits a token. Otherwise, callers
129     // must provide the same (in progress) token on the next call (unless
130     // they call reset() first).
131     bool nextToken(SegmentedString&, HTMLToken&);
132 
lineNumber()133     int lineNumber() const { return m_lineNumber; }
columnNumber()134     int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
135 
state()136     State state() const { return m_state; }
setState(State state)137     void setState(State state) { m_state = state; }
138 
139     // Updates the tokenizer's state according to the given tag name. This is
140     // an approximation of how the tree builder would update the tokenizer's
141     // state. This method is useful for approximating HTML tokenization. To
142     // get exactly the correct tokenization, you need the real tree builder.
143     //
144     // The main failures in the approximation are as follows:
145     //
146     //  * The first set of character tokens emitted for a <pre> element might
147     //    contain an extra leading newline.
148     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
149     //    tree builder's insertion mode.
150     //  * CDATA sections in foreign content will be tokenized as bogus comments
151     //    instead of as character tokens.
152     //
153     void updateStateFor(const AtomicString& tagName, Frame*);
154 
155     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
156     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
setSkipLeadingNewLineForListing(bool value)157     void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
158 
forceNullCharacterReplacement()159     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
setForceNullCharacterReplacement(bool value)160     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
161 
shouldAllowCDATA()162     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
setShouldAllowCDATA(bool value)163     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
164 
shouldSkipNullCharacters()165     bool shouldSkipNullCharacters() const
166     {
167         return !m_forceNullCharacterReplacement
168             && (m_state == DataState
169                 || m_state == RCDATAState
170                 || m_state == RAWTEXTState
171                 || m_state == PLAINTEXTState);
172     }
173 
174 private:
175     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
176     class InputStreamPreprocessor {
177         WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
178     public:
InputStreamPreprocessor(HTMLTokenizer * tokenizer)179         InputStreamPreprocessor(HTMLTokenizer* tokenizer)
180             : m_tokenizer(tokenizer)
181             , m_nextInputCharacter('\0')
182             , m_skipNextNewLine(false)
183         {
184         }
185 
nextInputCharacter()186         UChar nextInputCharacter() const { return m_nextInputCharacter; }
187 
188         // Returns whether we succeeded in peeking at the next character.
189         // The only way we can fail to peek is if there are no more
190         // characters in |source| (after collapsing \r\n, etc).
peek(SegmentedString & source,int & lineNumber)191         ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
192         {
193         PeekAgain:
194             m_nextInputCharacter = *source;
195 
196             // Every branch in this function is expensive, so we have a
197             // fast-reject branch for characters that don't require special
198             // handling. Please run the parser benchmark whenever you touch
199             // this function. It's very hot.
200             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
201             if (m_nextInputCharacter & ~specialCharacterMask) {
202                 m_skipNextNewLine = false;
203                 return true;
204             }
205 
206             if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
207                 m_skipNextNewLine = false;
208                 source.advancePastNewline(lineNumber);
209                 if (source.isEmpty())
210                     return false;
211                 m_nextInputCharacter = *source;
212             }
213             if (m_nextInputCharacter == '\r') {
214                 m_nextInputCharacter = '\n';
215                 m_skipNextNewLine = true;
216             } else {
217                 m_skipNextNewLine = false;
218                 // FIXME: The spec indicates that the surrogate pair range as well as
219                 // a number of specific character values are parse errors and should be replaced
220                 // by the replacement character. We suspect this is a problem with the spec as doing
221                 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
222                 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
223                     if (m_tokenizer->shouldSkipNullCharacters()) {
224                         source.advancePastNonNewline();
225                         if (source.isEmpty())
226                             return false;
227                         goto PeekAgain;
228                     }
229                     m_nextInputCharacter = 0xFFFD;
230                 }
231             }
232             return true;
233         }
234 
235         // Returns whether there are more characters in |source| after advancing.
advance(SegmentedString & source,int & lineNumber)236         bool advance(SegmentedString& source, int& lineNumber)
237         {
238             source.advance(lineNumber);
239             if (source.isEmpty())
240                 return false;
241             return peek(source, lineNumber);
242         }
243 
244         static const UChar endOfFileMarker;
245 
246     private:
shouldTreatNullAsEndOfFileMarker(SegmentedString & source)247         bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
248         {
249             return source.isClosed() && source.length() == 1;
250         }
251 
252         HTMLTokenizer* m_tokenizer;
253 
254         // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
255         UChar m_nextInputCharacter;
256         bool m_skipNextNewLine;
257     };
258 
259     HTMLTokenizer(bool usePreHTML5ParserQuirks);
260 
261     inline bool processEntity(SegmentedString&);
262 
263     inline void parseError();
264     inline void bufferCharacter(UChar);
265     inline void bufferCodePoint(unsigned);
266 
267     inline bool emitAndResumeIn(SegmentedString&, State);
268     inline bool emitAndReconsumeIn(SegmentedString&, State);
269     inline bool emitEndOfFile(SegmentedString&);
270     inline bool flushEmitAndResumeIn(SegmentedString&, State);
271 
272     // Return whether we need to emit a character token before dealing with
273     // the buffered end tag.
274     inline bool flushBufferedEndTag(SegmentedString&);
275     inline bool temporaryBufferIs(const String&);
276 
277     // Sometimes we speculatively consume input characters and we don't
278     // know whether they represent end tags or RCDATA, etc. These
279     // functions help manage these state.
280     inline void addToPossibleEndTag(UChar cc);
281     inline void saveEndTagNameIfNeeded();
282     inline bool isAppropriateEndTag();
283 
284     inline bool haveBufferedCharacterToken();
285 
286     State m_state;
287 
288     Vector<UChar, 32> m_appropriateEndTagName;
289 
290     // m_token is owned by the caller. If nextToken is not on the stack,
291     // this member might be pointing to unallocated memory.
292     HTMLToken* m_token;
293     int m_lineNumber;
294 
295     bool m_skipLeadingNewLineForListing;
296     bool m_forceNullCharacterReplacement;
297     bool m_shouldAllowCDATA;
298 
299     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
300     Vector<UChar, 32> m_temporaryBuffer;
301 
302     // We occationally want to emit both a character token and an end tag
303     // token (e.g., when lexing script). We buffer the name of the end tag
304     // token here so we remember it next time we re-enter the tokenizer.
305     Vector<UChar, 32> m_bufferedEndTagName;
306 
307     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
308     UChar m_additionalAllowedCharacter;
309 
310     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
311     InputStreamPreprocessor m_inputStreamPreprocessor;
312 
313     bool m_usePreHTML5ParserQuirks;
314 };
315 
316 }
317 
318 #endif
319