• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 #include "HTMLTokenizer.h"
30 
31 #include "HTMLEntityParser.h"
32 #include "HTMLToken.h"
33 #include "HTMLTreeBuilder.h"
34 #include "HTMLNames.h"
35 #include "NotImplemented.h"
36 #include <wtf/ASCIICType.h>
37 #include <wtf/CurrentTime.h>
38 #include <wtf/UnusedParam.h>
39 #include <wtf/text/AtomicString.h>
40 #include <wtf/text/CString.h>
41 #include <wtf/unicode/Unicode.h>
42 
43 using namespace WTF;
44 
45 namespace WebCore {
46 
47 using namespace HTMLNames;
48 
49 const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0;
50 
51 namespace {
52 
toLowerCase(UChar cc)53 inline UChar toLowerCase(UChar cc)
54 {
55     ASSERT(isASCIIUpper(cc));
56     const int lowerCaseOffset = 0x20;
57     return cc + lowerCaseOffset;
58 }
59 
isTokenizerWhitespace(UChar cc)60 inline bool isTokenizerWhitespace(UChar cc)
61 {
62     return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C';
63 }
64 
advanceStringAndASSERTIgnoringCase(SegmentedString & source,const char * expectedCharacters)65 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
66 {
67     while (*expectedCharacters)
68         source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
69 }
70 
advanceStringAndASSERT(SegmentedString & source,const char * expectedCharacters)71 inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters)
72 {
73     while (*expectedCharacters)
74         source.advanceAndASSERT(*expectedCharacters++);
75 }
76 
vectorEqualsString(const Vector<UChar,32> & vector,const String & string)77 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
78 {
79     if (vector.size() != string.length())
80         return false;
81     const UChar* stringData = string.characters();
82     const UChar* vectorData = vector.data();
83     // FIXME: Is there a higher-level function we should be calling here?
84     return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
85 }
86 
isEndTagBufferingState(HTMLTokenizer::State state)87 inline bool isEndTagBufferingState(HTMLTokenizer::State state)
88 {
89     switch (state) {
90     case HTMLTokenizer::RCDATAEndTagOpenState:
91     case HTMLTokenizer::RCDATAEndTagNameState:
92     case HTMLTokenizer::RAWTEXTEndTagOpenState:
93     case HTMLTokenizer::RAWTEXTEndTagNameState:
94     case HTMLTokenizer::ScriptDataEndTagOpenState:
95     case HTMLTokenizer::ScriptDataEndTagNameState:
96     case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
97     case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
98         return true;
99     default:
100         return false;
101     }
102 }
103 
104 }
105 
HTMLTokenizer(bool usePreHTML5ParserQuirks)106 HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
107     : m_inputStreamPreprocessor(this)
108     , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
109 {
110     reset();
111 }
112 
~HTMLTokenizer()113 HTMLTokenizer::~HTMLTokenizer()
114 {
115 }
116 
reset()117 void HTMLTokenizer::reset()
118 {
119     m_state = DataState;
120     m_token = 0;
121     m_lineNumber = 0;
122     m_skipLeadingNewLineForListing = false;
123     m_forceNullCharacterReplacement = false;
124     m_shouldAllowCDATA = false;
125     m_additionalAllowedCharacter = '\0';
126 }
127 
processEntity(SegmentedString & source)128 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
129 {
130     bool notEnoughCharacters = false;
131     Vector<UChar, 16> decodedEntity;
132     bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
133     if (notEnoughCharacters)
134         return false;
135     if (!success) {
136         ASSERT(decodedEntity.isEmpty());
137         bufferCharacter('&');
138     } else {
139         Vector<UChar>::const_iterator iter = decodedEntity.begin();
140         for (; iter != decodedEntity.end(); ++iter)
141             bufferCharacter(*iter);
142     }
143     return true;
144 }
145 
146 #if COMPILER(MSVC)
147 // We need to disable the "unreachable code" warning because we want to assert
148 // that some code points aren't reached in the state machine.
149 #pragma warning(disable: 4702)
150 #endif
151 
152 #define BEGIN_STATE(stateName) case stateName: stateName:
153 #define END_STATE() ASSERT_NOT_REACHED(); break;
154 
155 // We use this macro when the HTML5 spec says "reconsume the current input
156 // character in the <mumble> state."
157 #define RECONSUME_IN(stateName)                                            \
158     do {                                                                   \
159         m_state = stateName;                                               \
160         goto stateName;                                                    \
161     } while (false)
162 
163 // We use this macro when the HTML5 spec says "consume the next input
164 // character ... and switch to the <mumble> state."
165 #define ADVANCE_TO(stateName)                                              \
166     do {                                                                   \
167         m_state = stateName;                                               \
168         if (!m_inputStreamPreprocessor.advance(source, m_lineNumber))      \
169             return haveBufferedCharacterToken();                           \
170         cc = m_inputStreamPreprocessor.nextInputCharacter();               \
171         goto stateName;                                                    \
172     } while (false)
173 
174 // Sometimes there's more complicated logic in the spec that separates when
175 // we consume the next input character and when we switch to a particular
176 // state. We handle those cases by advancing the source directly and using
177 // this macro to switch to the indicated state.
178 #define SWITCH_TO(stateName)                                               \
179     do {                                                                   \
180         m_state = stateName;                                               \
181         if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
182             return haveBufferedCharacterToken();                           \
183         cc = m_inputStreamPreprocessor.nextInputCharacter();               \
184         goto stateName;                                                    \
185     } while (false)
186 
187 
saveEndTagNameIfNeeded()188 inline void HTMLTokenizer::saveEndTagNameIfNeeded()
189 {
190     ASSERT(m_token->type() != HTMLToken::Uninitialized);
191     if (m_token->type() == HTMLToken::StartTag)
192         m_appropriateEndTagName = m_token->name();
193 }
194 
195 // We use this function when the HTML5 spec says "Emit the current <mumble>
196 // token. Switch to the <mumble> state."  We use the word "resume" instead of
197 // switch to indicate that this macro actually returns and that we'll end up
198 // in the state when we "resume" (i.e., are called again).
emitAndResumeIn(SegmentedString & source,State state)199 bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state)
200 {
201     m_state = state;
202     source.advance(m_lineNumber);
203     saveEndTagNameIfNeeded();
204     return true;
205 }
206 
207 // Identical to emitAndResumeIn, except does not advance.
emitAndReconsumeIn(SegmentedString &,State state)208 bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state)
209 {
210     m_state = state;
211     saveEndTagNameIfNeeded();
212     return true;
213 }
214 
215 // Used to emit the EndOfFile token.
216 // Check if we have buffered characters to emit first before emitting the EOF.
emitEndOfFile(SegmentedString & source)217 bool HTMLTokenizer::emitEndOfFile(SegmentedString& source)
218 {
219     if (haveBufferedCharacterToken())
220         return true;
221     m_state = DataState;
222     source.advance(m_lineNumber);
223     m_token->clear();
224     m_token->makeEndOfFile();
225     return true;
226 }
227 
flushBufferedEndTag(SegmentedString & source)228 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
229 {
230     ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
231     source.advance(m_lineNumber);
232     if (m_token->type() == HTMLToken::Character)
233         return true;
234     m_token->beginEndTag(m_bufferedEndTagName);
235     m_bufferedEndTagName.clear();
236     return false;
237 }
238 
// Records the new state and flushes the buffered end tag. When
// flushBufferedEndTag() returns true the enclosing function returns true
// straight away. Otherwise the macro peeks at the current character and jumps
// to the state; if |source| is empty or the peek fails, the enclosing
// function returns haveBufferedCharacterToken().
#define FLUSH_AND_ADVANCE_TO(stateName) \
    do { \
        m_state = stateName; \
        if (flushBufferedEndTag(source)) \
            return true; \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
            return haveBufferedCharacterToken(); \
        cc = m_inputStreamPreprocessor.nextInputCharacter(); \
        goto stateName; \
    } while (false)
250 
flushEmitAndResumeIn(SegmentedString & source,State state)251 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state)
252 {
253     m_state = state;
254     flushBufferedEndTag(source);
255     return true;
256 }
257 
nextToken(SegmentedString & source,HTMLToken & token)258 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
259 {
260     // If we have a token in progress, then we're supposed to be called back
261     // with the same token so we can finish it.
262     ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
263     m_token = &token;
264 
265     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
266         // FIXME: This should call flushBufferedEndTag().
267         // We started an end tag during our last iteration.
268         m_token->beginEndTag(m_bufferedEndTagName);
269         m_bufferedEndTagName.clear();
270         if (m_state == DataState) {
271             // We're back in the data state, so we must be done with the tag.
272             return true;
273         }
274     }
275 
276     if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
277         return haveBufferedCharacterToken();
278     UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
279 
280     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
281     // Note that this logic is different than the generic \r\n collapsing
282     // handled in the input stream preprocessor. This logic is here as an
283     // "authoring convenience" so folks can write:
284     //
285     // <pre>
286     // lorem ipsum
287     // lorem ipsum
288     // </pre>
289     //
290     // without getting an extra newline at the start of their <pre> element.
291     if (m_skipLeadingNewLineForListing) {
292         m_skipLeadingNewLineForListing = false;
293         if (cc == '\n') {
294             if (m_state == DataState)
295                 ADVANCE_TO(DataState);
296             if (m_state == RCDATAState)
297                 ADVANCE_TO(RCDATAState);
298             // When parsing text/plain documents, we run the tokenizer in the
299             // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
300             ASSERT(m_state == PLAINTEXTState);
301         }
302     }
303 
304     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
305     switch (m_state) {
306     BEGIN_STATE(DataState) {
307         if (cc == '&')
308             ADVANCE_TO(CharacterReferenceInDataState);
309         else if (cc == '<') {
310             if (m_token->type() == HTMLToken::Character) {
311                 // We have a bunch of character tokens queued up that we
312                 // are emitting lazily here.
313                 return true;
314             }
315             ADVANCE_TO(TagOpenState);
316         } else if (cc == InputStreamPreprocessor::endOfFileMarker)
317             return emitEndOfFile(source);
318         else {
319             bufferCharacter(cc);
320             ADVANCE_TO(DataState);
321         }
322     }
323     END_STATE()
324 
325     BEGIN_STATE(CharacterReferenceInDataState) {
326         if (!processEntity(source))
327             return haveBufferedCharacterToken();
328         SWITCH_TO(DataState);
329     }
330     END_STATE()
331 
332     BEGIN_STATE(RCDATAState) {
333         if (cc == '&')
334             ADVANCE_TO(CharacterReferenceInRCDATAState);
335         else if (cc == '<')
336             ADVANCE_TO(RCDATALessThanSignState);
337         else if (cc == InputStreamPreprocessor::endOfFileMarker)
338             return emitEndOfFile(source);
339         else {
340             bufferCharacter(cc);
341             ADVANCE_TO(RCDATAState);
342         }
343     }
344     END_STATE()
345 
346     BEGIN_STATE(CharacterReferenceInRCDATAState) {
347         if (!processEntity(source))
348             return haveBufferedCharacterToken();
349         SWITCH_TO(RCDATAState);
350     }
351     END_STATE()
352 
353     BEGIN_STATE(RAWTEXTState) {
354         if (cc == '<')
355             ADVANCE_TO(RAWTEXTLessThanSignState);
356         else if (cc == InputStreamPreprocessor::endOfFileMarker)
357             return emitEndOfFile(source);
358         else {
359             bufferCharacter(cc);
360             ADVANCE_TO(RAWTEXTState);
361         }
362     }
363     END_STATE()
364 
365     BEGIN_STATE(ScriptDataState) {
366         if (cc == '<')
367             ADVANCE_TO(ScriptDataLessThanSignState);
368         else if (cc == InputStreamPreprocessor::endOfFileMarker)
369             return emitEndOfFile(source);
370         else {
371             bufferCharacter(cc);
372             ADVANCE_TO(ScriptDataState);
373         }
374     }
375     END_STATE()
376 
377     BEGIN_STATE(PLAINTEXTState) {
378         if (cc == InputStreamPreprocessor::endOfFileMarker)
379             return emitEndOfFile(source);
380         else
381             bufferCharacter(cc);
382         ADVANCE_TO(PLAINTEXTState);
383     }
384     END_STATE()
385 
386     BEGIN_STATE(TagOpenState) {
387         if (cc == '!')
388             ADVANCE_TO(MarkupDeclarationOpenState);
389         else if (cc == '/')
390             ADVANCE_TO(EndTagOpenState);
391         else if (isASCIIUpper(cc)) {
392             m_token->beginStartTag(toLowerCase(cc));
393             ADVANCE_TO(TagNameState);
394         } else if (isASCIILower(cc)) {
395             m_token->beginStartTag(cc);
396             ADVANCE_TO(TagNameState);
397         } else if (cc == '?') {
398             parseError();
399             // The spec consumes the current character before switching
400             // to the bogus comment state, but it's easier to implement
401             // if we reconsume the current character.
402             RECONSUME_IN(BogusCommentState);
403         } else {
404             parseError();
405             bufferCharacter('<');
406             RECONSUME_IN(DataState);
407         }
408     }
409     END_STATE()
410 
411     BEGIN_STATE(EndTagOpenState) {
412         if (isASCIIUpper(cc)) {
413             m_token->beginEndTag(toLowerCase(cc));
414             ADVANCE_TO(TagNameState);
415         } else if (isASCIILower(cc)) {
416             m_token->beginEndTag(cc);
417             ADVANCE_TO(TagNameState);
418         } else if (cc == '>') {
419             parseError();
420             ADVANCE_TO(DataState);
421         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
422             parseError();
423             bufferCharacter('<');
424             bufferCharacter('/');
425             RECONSUME_IN(DataState);
426         } else {
427             parseError();
428             RECONSUME_IN(BogusCommentState);
429         }
430     }
431     END_STATE()
432 
433     BEGIN_STATE(TagNameState) {
434         if (isTokenizerWhitespace(cc))
435             ADVANCE_TO(BeforeAttributeNameState);
436         else if (cc == '/')
437             ADVANCE_TO(SelfClosingStartTagState);
438         else if (cc == '>')
439             return emitAndResumeIn(source, DataState);
440         else if (m_usePreHTML5ParserQuirks && cc == '<')
441             return emitAndReconsumeIn(source, DataState);
442         else if (isASCIIUpper(cc)) {
443             m_token->appendToName(toLowerCase(cc));
444             ADVANCE_TO(TagNameState);
445         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
446             parseError();
447             RECONSUME_IN(DataState);
448         } else {
449             m_token->appendToName(cc);
450             ADVANCE_TO(TagNameState);
451         }
452     }
453     END_STATE()
454 
455     BEGIN_STATE(RCDATALessThanSignState) {
456         if (cc == '/') {
457             m_temporaryBuffer.clear();
458             ASSERT(m_bufferedEndTagName.isEmpty());
459             ADVANCE_TO(RCDATAEndTagOpenState);
460         } else {
461             bufferCharacter('<');
462             RECONSUME_IN(RCDATAState);
463         }
464     }
465     END_STATE()
466 
467     BEGIN_STATE(RCDATAEndTagOpenState) {
468         if (isASCIIUpper(cc)) {
469             m_temporaryBuffer.append(cc);
470             addToPossibleEndTag(toLowerCase(cc));
471             ADVANCE_TO(RCDATAEndTagNameState);
472         } else if (isASCIILower(cc)) {
473             m_temporaryBuffer.append(cc);
474             addToPossibleEndTag(cc);
475             ADVANCE_TO(RCDATAEndTagNameState);
476         } else {
477             bufferCharacter('<');
478             bufferCharacter('/');
479             RECONSUME_IN(RCDATAState);
480         }
481     }
482     END_STATE()
483 
484     BEGIN_STATE(RCDATAEndTagNameState) {
485         if (isASCIIUpper(cc)) {
486             m_temporaryBuffer.append(cc);
487             addToPossibleEndTag(toLowerCase(cc));
488             ADVANCE_TO(RCDATAEndTagNameState);
489         } else if (isASCIILower(cc)) {
490             m_temporaryBuffer.append(cc);
491             addToPossibleEndTag(cc);
492             ADVANCE_TO(RCDATAEndTagNameState);
493         } else {
494             if (isTokenizerWhitespace(cc)) {
495                 if (isAppropriateEndTag())
496                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
497             } else if (cc == '/') {
498                 if (isAppropriateEndTag())
499                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
500             } else if (cc == '>') {
501                 if (isAppropriateEndTag())
502                     return flushEmitAndResumeIn(source, DataState);
503             }
504             bufferCharacter('<');
505             bufferCharacter('/');
506             m_token->appendToCharacter(m_temporaryBuffer);
507             m_bufferedEndTagName.clear();
508             RECONSUME_IN(RCDATAState);
509         }
510     }
511     END_STATE()
512 
513     BEGIN_STATE(RAWTEXTLessThanSignState) {
514         if (cc == '/') {
515             m_temporaryBuffer.clear();
516             ASSERT(m_bufferedEndTagName.isEmpty());
517             ADVANCE_TO(RAWTEXTEndTagOpenState);
518         } else {
519             bufferCharacter('<');
520             RECONSUME_IN(RAWTEXTState);
521         }
522     }
523     END_STATE()
524 
525     BEGIN_STATE(RAWTEXTEndTagOpenState) {
526         if (isASCIIUpper(cc)) {
527             m_temporaryBuffer.append(cc);
528             addToPossibleEndTag(toLowerCase(cc));
529             ADVANCE_TO(RAWTEXTEndTagNameState);
530         } else if (isASCIILower(cc)) {
531             m_temporaryBuffer.append(cc);
532             addToPossibleEndTag(cc);
533             ADVANCE_TO(RAWTEXTEndTagNameState);
534         } else {
535             bufferCharacter('<');
536             bufferCharacter('/');
537             RECONSUME_IN(RAWTEXTState);
538         }
539     }
540     END_STATE()
541 
542     BEGIN_STATE(RAWTEXTEndTagNameState) {
543         if (isASCIIUpper(cc)) {
544             m_temporaryBuffer.append(cc);
545             addToPossibleEndTag(toLowerCase(cc));
546             ADVANCE_TO(RAWTEXTEndTagNameState);
547         } else if (isASCIILower(cc)) {
548             m_temporaryBuffer.append(cc);
549             addToPossibleEndTag(cc);
550             ADVANCE_TO(RAWTEXTEndTagNameState);
551         } else {
552             if (isTokenizerWhitespace(cc)) {
553                 if (isAppropriateEndTag())
554                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
555             } else if (cc == '/') {
556                 if (isAppropriateEndTag())
557                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
558             } else if (cc == '>') {
559                 if (isAppropriateEndTag())
560                     return flushEmitAndResumeIn(source, DataState);
561             }
562             bufferCharacter('<');
563             bufferCharacter('/');
564             m_token->appendToCharacter(m_temporaryBuffer);
565             m_bufferedEndTagName.clear();
566             RECONSUME_IN(RAWTEXTState);
567         }
568     }
569     END_STATE()
570 
571     BEGIN_STATE(ScriptDataLessThanSignState) {
572         if (cc == '/') {
573             m_temporaryBuffer.clear();
574             ASSERT(m_bufferedEndTagName.isEmpty());
575             ADVANCE_TO(ScriptDataEndTagOpenState);
576         } else if (cc == '!') {
577             bufferCharacter('<');
578             bufferCharacter('!');
579             ADVANCE_TO(ScriptDataEscapeStartState);
580         } else {
581             bufferCharacter('<');
582             RECONSUME_IN(ScriptDataState);
583         }
584     }
585     END_STATE()
586 
587     BEGIN_STATE(ScriptDataEndTagOpenState) {
588         if (isASCIIUpper(cc)) {
589             m_temporaryBuffer.append(cc);
590             addToPossibleEndTag(toLowerCase(cc));
591             ADVANCE_TO(ScriptDataEndTagNameState);
592         } else if (isASCIILower(cc)) {
593             m_temporaryBuffer.append(cc);
594             addToPossibleEndTag(cc);
595             ADVANCE_TO(ScriptDataEndTagNameState);
596         } else {
597             bufferCharacter('<');
598             bufferCharacter('/');
599             RECONSUME_IN(ScriptDataState);
600         }
601     }
602     END_STATE()
603 
604     BEGIN_STATE(ScriptDataEndTagNameState) {
605         if (isASCIIUpper(cc)) {
606             m_temporaryBuffer.append(cc);
607             addToPossibleEndTag(toLowerCase(cc));
608             ADVANCE_TO(ScriptDataEndTagNameState);
609         } else if (isASCIILower(cc)) {
610             m_temporaryBuffer.append(cc);
611             addToPossibleEndTag(cc);
612             ADVANCE_TO(ScriptDataEndTagNameState);
613         } else {
614             if (isTokenizerWhitespace(cc)) {
615                 if (isAppropriateEndTag())
616                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
617             } else if (cc == '/') {
618                 if (isAppropriateEndTag())
619                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
620             } else if (cc == '>') {
621                 if (isAppropriateEndTag())
622                     return flushEmitAndResumeIn(source, DataState);
623             }
624             bufferCharacter('<');
625             bufferCharacter('/');
626             m_token->appendToCharacter(m_temporaryBuffer);
627             m_bufferedEndTagName.clear();
628             RECONSUME_IN(ScriptDataState);
629         }
630     }
631     END_STATE()
632 
633     BEGIN_STATE(ScriptDataEscapeStartState) {
634         if (cc == '-') {
635             bufferCharacter(cc);
636             ADVANCE_TO(ScriptDataEscapeStartDashState);
637         } else
638             RECONSUME_IN(ScriptDataState);
639     }
640     END_STATE()
641 
642     BEGIN_STATE(ScriptDataEscapeStartDashState) {
643         if (cc == '-') {
644             bufferCharacter(cc);
645             ADVANCE_TO(ScriptDataEscapedDashDashState);
646         } else
647             RECONSUME_IN(ScriptDataState);
648     }
649     END_STATE()
650 
651     BEGIN_STATE(ScriptDataEscapedState) {
652         if (cc == '-') {
653             bufferCharacter(cc);
654             ADVANCE_TO(ScriptDataEscapedDashState);
655         } else if (cc == '<')
656             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
657         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
658             parseError();
659             RECONSUME_IN(DataState);
660         } else {
661             bufferCharacter(cc);
662             ADVANCE_TO(ScriptDataEscapedState);
663         }
664     }
665     END_STATE()
666 
667     BEGIN_STATE(ScriptDataEscapedDashState) {
668         if (cc == '-') {
669             bufferCharacter(cc);
670             ADVANCE_TO(ScriptDataEscapedDashDashState);
671         } else if (cc == '<')
672             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
673         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
674             parseError();
675             RECONSUME_IN(DataState);
676         } else {
677             bufferCharacter(cc);
678             ADVANCE_TO(ScriptDataEscapedState);
679         }
680     }
681     END_STATE()
682 
683     BEGIN_STATE(ScriptDataEscapedDashDashState) {
684         if (cc == '-') {
685             bufferCharacter(cc);
686             ADVANCE_TO(ScriptDataEscapedDashDashState);
687         } else if (cc == '<')
688             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
689         else if (cc == '>') {
690             bufferCharacter(cc);
691             ADVANCE_TO(ScriptDataState);
692         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
693             parseError();
694             RECONSUME_IN(DataState);
695         } else {
696             bufferCharacter(cc);
697             ADVANCE_TO(ScriptDataEscapedState);
698         }
699     }
700     END_STATE()
701 
702     BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
703         if (cc == '/') {
704             m_temporaryBuffer.clear();
705             ASSERT(m_bufferedEndTagName.isEmpty());
706             ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
707         } else if (isASCIIUpper(cc)) {
708             bufferCharacter('<');
709             bufferCharacter(cc);
710             m_temporaryBuffer.clear();
711             m_temporaryBuffer.append(toLowerCase(cc));
712             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
713         } else if (isASCIILower(cc)) {
714             bufferCharacter('<');
715             bufferCharacter(cc);
716             m_temporaryBuffer.clear();
717             m_temporaryBuffer.append(cc);
718             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
719         } else {
720             bufferCharacter('<');
721             RECONSUME_IN(ScriptDataEscapedState);
722         }
723     }
724     END_STATE()
725 
726     BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
727         if (isASCIIUpper(cc)) {
728             m_temporaryBuffer.append(cc);
729             addToPossibleEndTag(toLowerCase(cc));
730             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
731         } else if (isASCIILower(cc)) {
732             m_temporaryBuffer.append(cc);
733             addToPossibleEndTag(cc);
734             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
735         } else {
736             bufferCharacter('<');
737             bufferCharacter('/');
738             RECONSUME_IN(ScriptDataEscapedState);
739         }
740     }
741     END_STATE()
742 
743     BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
744         if (isASCIIUpper(cc)) {
745             m_temporaryBuffer.append(cc);
746             addToPossibleEndTag(toLowerCase(cc));
747             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
748         } else if (isASCIILower(cc)) {
749             m_temporaryBuffer.append(cc);
750             addToPossibleEndTag(cc);
751             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
752         } else {
753             if (isTokenizerWhitespace(cc)) {
754                 if (isAppropriateEndTag())
755                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
756             } else if (cc == '/') {
757                 if (isAppropriateEndTag())
758                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
759             } else if (cc == '>') {
760                 if (isAppropriateEndTag())
761                     return flushEmitAndResumeIn(source, DataState);
762             }
763             bufferCharacter('<');
764             bufferCharacter('/');
765             m_token->appendToCharacter(m_temporaryBuffer);
766             m_bufferedEndTagName.clear();
767             RECONSUME_IN(ScriptDataEscapedState);
768         }
769     }
770     END_STATE()
771 
772     BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
773         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
774             bufferCharacter(cc);
775             if (temporaryBufferIs(scriptTag.localName()))
776                 ADVANCE_TO(ScriptDataDoubleEscapedState);
777             else
778                 ADVANCE_TO(ScriptDataEscapedState);
779         } else if (isASCIIUpper(cc)) {
780             bufferCharacter(cc);
781             m_temporaryBuffer.append(toLowerCase(cc));
782             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
783         } else if (isASCIILower(cc)) {
784             bufferCharacter(cc);
785             m_temporaryBuffer.append(cc);
786             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
787         } else
788             RECONSUME_IN(ScriptDataEscapedState);
789     }
790     END_STATE()
791 
792     BEGIN_STATE(ScriptDataDoubleEscapedState) { // Base state of double-escaped script data: buffers each character and branches on '-' and '<'.
793         if (cc == '-') {
794             bufferCharacter(cc);
795             ADVANCE_TO(ScriptDataDoubleEscapedDashState);
796         } else if (cc == '<') {
797             bufferCharacter(cc);
798             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
799         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
800             parseError(); // End-of-file marker: report an error; the marker is not buffered.
801             RECONSUME_IN(DataState);
802         } else {
803             bufferCharacter(cc);
804             ADVANCE_TO(ScriptDataDoubleEscapedState);
805         }
806     }
807     END_STATE()
808 
809     BEGIN_STATE(ScriptDataDoubleEscapedDashState) { // Entered after a single '-' in double-escaped script data.
810         if (cc == '-') {
811             bufferCharacter(cc);
812             ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); // Second consecutive '-'.
813         } else if (cc == '<') {
814             bufferCharacter(cc);
815             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
816         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
817             parseError();
818             RECONSUME_IN(DataState);
819         } else {
820             bufferCharacter(cc);
821             ADVANCE_TO(ScriptDataDoubleEscapedState); // Any other character ends the dash run.
822         }
823     }
824     END_STATE()
825 
826     BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { // Entered after two or more consecutive '-' in double-escaped script data.
827         if (cc == '-') {
828             bufferCharacter(cc);
829             ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); // Further dashes keep us in this state.
830         } else if (cc == '<') {
831             bufferCharacter(cc);
832             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
833         } else if (cc == '>') {
834             bufferCharacter(cc);
835             ADVANCE_TO(ScriptDataState); // "-->" leaves the escaped region and returns to plain script data.
836         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
837             parseError();
838             RECONSUME_IN(DataState);
839         } else {
840             bufferCharacter(cc);
841             ADVANCE_TO(ScriptDataDoubleEscapedState);
842         }
843     }
844     END_STATE()
845 
846     BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { // Entered after '<' in double-escaped script data; only "</" is of interest.
847         if (cc == '/') {
848             bufferCharacter(cc);
849             m_temporaryBuffer.clear(); // Start collecting a fresh candidate name for the escape-end check.
850             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
851         } else
852             RECONSUME_IN(ScriptDataDoubleEscapedState); // Not an end-tag opener: let the base state handle this character.
853     }
854     END_STATE()
855 
856     BEGIN_STATE(ScriptDataDoubleEscapeEndState) { // Mirror of the double-escape start state: collects a name after "</" and drops back to the escaped state if it is "script".
857         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { // Delimiter: the candidate name is complete.
858             bufferCharacter(cc);
859             if (temporaryBufferIs(scriptTag.localName()))
860                 ADVANCE_TO(ScriptDataEscapedState); // Matching name ends the double escape.
861             else
862                 ADVANCE_TO(ScriptDataDoubleEscapedState);
863         } else if (isASCIIUpper(cc)) {
864             bufferCharacter(cc);
865             m_temporaryBuffer.append(toLowerCase(cc)); // The comparison buffer stores the lower-cased form.
866             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
867         } else if (isASCIILower(cc)) {
868             bufferCharacter(cc);
869             m_temporaryBuffer.append(cc);
870             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
871         } else
872             RECONSUME_IN(ScriptDataDoubleEscapedState);
873     }
874     END_STATE()
875 
876     BEGIN_STATE(BeforeAttributeNameState) { // Skips whitespace inside a tag and starts a new attribute on the first name character.
877         if (isTokenizerWhitespace(cc))
878             ADVANCE_TO(BeforeAttributeNameState);
879         else if (cc == '/')
880             ADVANCE_TO(SelfClosingStartTagState);
881         else if (cc == '>')
882             return emitAndResumeIn(source, DataState); // Tag is complete: emit the current token.
883         else if (m_usePreHTML5ParserQuirks && cc == '<') // Quirk mode only: '<' ends the tag and is processed again in the data state.
884             return emitAndReconsumeIn(source, DataState);
885         else if (isASCIIUpper(cc)) {
886             m_token->addNewAttribute();
887             m_token->beginAttributeName(source.numberOfCharactersConsumed()); // Record where the name starts in the input.
888             m_token->appendToAttributeName(toLowerCase(cc)); // Attribute names are stored lower-cased.
889             ADVANCE_TO(AttributeNameState);
890         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
891             parseError();
892             RECONSUME_IN(DataState);
893         } else {
894             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') // These characters are reported as errors but still start an attribute name.
895                 parseError();
896             m_token->addNewAttribute();
897             m_token->beginAttributeName(source.numberOfCharactersConsumed());
898             m_token->appendToAttributeName(cc);
899             ADVANCE_TO(AttributeNameState);
900         }
901     }
902     END_STATE()
903 
904     BEGIN_STATE(AttributeNameState) { // Appends characters to the current attribute name until a character that terminates the name is seen.
905         if (isTokenizerWhitespace(cc)) {
906             m_token->endAttributeName(source.numberOfCharactersConsumed()); // Every terminating branch records where the name ends.
907             ADVANCE_TO(AfterAttributeNameState);
908         } else if (cc == '/') {
909             m_token->endAttributeName(source.numberOfCharactersConsumed());
910             ADVANCE_TO(SelfClosingStartTagState);
911         } else if (cc == '=') {
912             m_token->endAttributeName(source.numberOfCharactersConsumed());
913             ADVANCE_TO(BeforeAttributeValueState);
914         } else if (cc == '>') {
915             m_token->endAttributeName(source.numberOfCharactersConsumed());
916             return emitAndResumeIn(source, DataState);
917         } else if (m_usePreHTML5ParserQuirks && cc == '<') { // Quirk mode only: '<' ends the tag and is processed again in the data state.
918             m_token->endAttributeName(source.numberOfCharactersConsumed());
919             return emitAndReconsumeIn(source, DataState);
920         } else if (isASCIIUpper(cc)) {
921             m_token->appendToAttributeName(toLowerCase(cc)); // Attribute names are stored lower-cased.
922             ADVANCE_TO(AttributeNameState);
923         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
924             parseError();
925             m_token->endAttributeName(source.numberOfCharactersConsumed());
926             RECONSUME_IN(DataState);
927         } else {
928             if (cc == '"' || cc == '\'' || cc == '<') // '=' cannot reach this branch (handled above), so it is not tested here.
929                 parseError();
930             m_token->appendToAttributeName(cc);
931             ADVANCE_TO(AttributeNameState);
932         }
933     }
934     END_STATE()
935 
936     BEGIN_STATE(AfterAttributeNameState) { // Reached after whitespace following an attribute name: expects '=', the end of the tag, or the start of another attribute.
937         if (isTokenizerWhitespace(cc))
938             ADVANCE_TO(AfterAttributeNameState);
939         else if (cc == '/')
940             ADVANCE_TO(SelfClosingStartTagState);
941         else if (cc == '=')
942             ADVANCE_TO(BeforeAttributeValueState);
943         else if (cc == '>')
944             return emitAndResumeIn(source, DataState);
945         else if (m_usePreHTML5ParserQuirks && cc == '<') // Quirk mode only: '<' ends the tag and is processed again in the data state.
946             return emitAndReconsumeIn(source, DataState);
947         else if (isASCIIUpper(cc)) {
948             m_token->addNewAttribute(); // The previous attribute had no value; begin the next one.
949             m_token->beginAttributeName(source.numberOfCharactersConsumed());
950             m_token->appendToAttributeName(toLowerCase(cc));
951             ADVANCE_TO(AttributeNameState);
952         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
953             parseError();
954             RECONSUME_IN(DataState);
955         } else {
956             if (cc == '"' || cc == '\'' || cc == '<') // Reported as errors but still start a new attribute name.
957                 parseError();
958             m_token->addNewAttribute();
959             m_token->beginAttributeName(source.numberOfCharactersConsumed());
960             m_token->appendToAttributeName(cc);
961             ADVANCE_TO(AttributeNameState);
962         }
963     }
964     END_STATE()
965 
966     BEGIN_STATE(BeforeAttributeValueState) { // Reached after '=': skips whitespace and selects the quoted or unquoted value state.
967         if (isTokenizerWhitespace(cc))
968             ADVANCE_TO(BeforeAttributeValueState);
969         else if (cc == '"') {
970             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); // +1: the recorded start skips the opening quote.
971             ADVANCE_TO(AttributeValueDoubleQuotedState);
972         } else if (cc == '&') {
973             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
974             RECONSUME_IN(AttributeValueUnquotedState); // The '&' is handled by the unquoted state itself.
975         } else if (cc == '\'') {
976             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); // +1: the recorded start skips the opening quote.
977             ADVANCE_TO(AttributeValueSingleQuotedState);
978         } else if (cc == '>') {
979             parseError(); // '=' followed directly by '>' is an error; the tag is still emitted.
980             return emitAndResumeIn(source, DataState);
981         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
982             parseError();
983             RECONSUME_IN(DataState);
984         } else {
985             if (cc == '<' || cc == '=' || cc == '`') // Reported as errors but still taken as the first value character.
986                 parseError();
987             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
988             m_token->appendToAttributeValue(cc);
989             ADVANCE_TO(AttributeValueUnquotedState);
990         }
991     }
992     END_STATE()
993 
994     BEGIN_STATE(AttributeValueDoubleQuotedState) { // Collects a double-quoted attribute value up to the closing '"'.
995         if (cc == '"') {
996             m_token->endAttributeValue(source.numberOfCharactersConsumed());
997             ADVANCE_TO(AfterAttributeValueQuotedState);
998         } else if (cc == '&') {
999             m_additionalAllowedCharacter = '"'; // Also tells the character-reference state which value state to return to.
1000             ADVANCE_TO(CharacterReferenceInAttributeValueState);
1001         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1002             parseError();
1003             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1004             RECONSUME_IN(DataState);
1005         } else {
1006             m_token->appendToAttributeValue(cc);
1007             ADVANCE_TO(AttributeValueDoubleQuotedState);
1008         }
1009     }
1010     END_STATE()
1011 
1012     BEGIN_STATE(AttributeValueSingleQuotedState) { // Collects a single-quoted attribute value up to the closing '\''.
1013         if (cc == '\'') {
1014             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1015             ADVANCE_TO(AfterAttributeValueQuotedState);
1016         } else if (cc == '&') {
1017             m_additionalAllowedCharacter = '\''; // Also tells the character-reference state which value state to return to.
1018             ADVANCE_TO(CharacterReferenceInAttributeValueState);
1019         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1020             parseError();
1021             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1022             RECONSUME_IN(DataState);
1023         } else {
1024             m_token->appendToAttributeValue(cc);
1025             ADVANCE_TO(AttributeValueSingleQuotedState);
1026         }
1027     }
1028     END_STATE()
1029 
1030     BEGIN_STATE(AttributeValueUnquotedState) { // Collects an unquoted attribute value; whitespace or '>' terminates it.
1031         if (isTokenizerWhitespace(cc)) {
1032             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1033             ADVANCE_TO(BeforeAttributeNameState);
1034         } else if (cc == '&') {
1035             m_additionalAllowedCharacter = '>'; // Also tells the character-reference state which value state to return to.
1036             ADVANCE_TO(CharacterReferenceInAttributeValueState);
1037         } else if (cc == '>') {
1038             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1039             return emitAndResumeIn(source, DataState);
1040         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1041             parseError();
1042             m_token->endAttributeValue(source.numberOfCharactersConsumed());
1043             RECONSUME_IN(DataState);
1044         } else {
1045             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') // Reported as errors but still appended to the value.
1046                 parseError();
1047             m_token->appendToAttributeValue(cc);
1048             ADVANCE_TO(AttributeValueUnquotedState);
1049         }
1050     }
1051     END_STATE()
1052 
1053     BEGIN_STATE(CharacterReferenceInAttributeValueState) { // Decodes a character reference inside an attribute value, then returns to the value state that sent us here.
1054         bool notEnoughCharacters = false;
1055         Vector<UChar, 16> decodedEntity;
1056         bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1057         if (notEnoughCharacters) // Input ran out before the reference could be resolved; bail out of tokenizing for now.
1058             return haveBufferedCharacterToken();
1059         if (!success) {
1060             ASSERT(decodedEntity.isEmpty());
1061             m_token->appendToAttributeValue('&'); // Not a valid reference: the '&' is kept literally.
1062         } else {
1063             Vector<UChar>::const_iterator iter = decodedEntity.begin();
1064             for (; iter != decodedEntity.end(); ++iter) // Append every decoded code unit to the value.
1065                 m_token->appendToAttributeValue(*iter);
1066         }
1067         // We're supposed to switch back to the attribute value state that
1068         // we were in when we were switched into this state. Rather than
1069         // keeping track of this explicitly, we observe that the previous
1070         // state can be determined by m_additionalAllowedCharacter.
1071         if (m_additionalAllowedCharacter == '"')
1072             SWITCH_TO(AttributeValueDoubleQuotedState);
1073         else if (m_additionalAllowedCharacter == '\'')
1074             SWITCH_TO(AttributeValueSingleQuotedState);
1075         else if (m_additionalAllowedCharacter == '>')
1076             SWITCH_TO(AttributeValueUnquotedState);
1077         else
1078             ASSERT_NOT_REACHED(); // Only the three attribute value states above set m_additionalAllowedCharacter in this chunk.
1079     }
1080     END_STATE()
1081 
1082     BEGIN_STATE(AfterAttributeValueQuotedState) { // Reached right after the closing quote of an attribute value.
1083         if (isTokenizerWhitespace(cc))
1084             ADVANCE_TO(BeforeAttributeNameState);
1085         else if (cc == '/')
1086             ADVANCE_TO(SelfClosingStartTagState);
1087         else if (cc == '>')
1088             return emitAndResumeIn(source, DataState);
1089         else if (m_usePreHTML5ParserQuirks && cc == '<') // Quirk mode only: '<' ends the tag and is processed again in the data state.
1090             return emitAndReconsumeIn(source, DataState);
1091         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1092             parseError();
1093             RECONSUME_IN(DataState);
1094         } else {
1095             parseError(); // Missing whitespace between attributes: report it, then treat the character as the start of the next attribute.
1096             RECONSUME_IN(BeforeAttributeNameState);
1097         }
1098     }
1099     END_STATE()
1100 
1101     BEGIN_STATE(SelfClosingStartTagState) { // Reached after '/' inside a tag: only an immediately following '>' makes the tag self-closing.
1102         if (cc == '>') {
1103             m_token->setSelfClosing();
1104             return emitAndResumeIn(source, DataState);
1105         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1106             parseError();
1107             RECONSUME_IN(DataState);
1108         } else {
1109             parseError(); // Stray '/': report it and continue attribute parsing with the same character.
1110             RECONSUME_IN(BeforeAttributeNameState);
1111         }
1112     }
1113     END_STATE()
1114 
1115     BEGIN_STATE(BogusCommentState) { // Entry point for a bogus comment: starts the comment token once, then delegates to the continue state.
1116         m_token->beginComment();
1117         RECONSUME_IN(ContinueBogusCommentState); // The current character becomes the first one examined by the continue state.
1118     }
1119     END_STATE()
1120 
1121     BEGIN_STATE(ContinueBogusCommentState) { // Appends everything up to '>' or end of input to the bogus comment.
1122         if (cc == '>')
1123             return emitAndResumeIn(source, DataState);
1124         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1125             return emitAndReconsumeIn(source, DataState); // No parse error is raised here, unlike the regular comment states.
1126         else {
1127             m_token->appendToComment(cc);
1128             ADVANCE_TO(ContinueBogusCommentState);
1129         }
1130     }
1131     END_STATE()
1132 
1133     BEGIN_STATE(MarkupDeclarationOpenState) { // Uses look-ahead to recognise "--" (comment), "doctype" (case-insensitive) or "[CDATA["; anything else becomes a bogus comment.
1134         DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1135         DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1136         DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
1137         if (cc == '-') {
1138             SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1139             if (result == SegmentedString::DidMatch) {
1140                 source.advanceAndASSERT('-');
1141                 source.advanceAndASSERT('-');
1142                 m_token->beginComment();
1143                 SWITCH_TO(CommentStartState);
1144             } else if (result == SegmentedString::NotEnoughCharacters) // Cannot decide yet; bail out of tokenizing for now.
1145                 return haveBufferedCharacterToken();
1146         } else if (cc == 'D' || cc == 'd') {
1147             SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1148             if (result == SegmentedString::DidMatch) {
1149                 advanceStringAndASSERTIgnoringCase(source, "doctype");
1150                 SWITCH_TO(DOCTYPEState);
1151             } else if (result == SegmentedString::NotEnoughCharacters)
1152                 return haveBufferedCharacterToken();
1153         } else if (cc == '[' && shouldAllowCDATA()) { // CDATA is only recognised when shouldAllowCDATA() permits it.
1154             SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1155             if (result == SegmentedString::DidMatch) {
1156                 advanceStringAndASSERT(source, "[CDATA[");
1157                 SWITCH_TO(CDATASectionState);
1158             } else if (result == SegmentedString::NotEnoughCharacters)
1159                 return haveBufferedCharacterToken();
1160         }
1161         parseError(); // Reached when no keyword matched; relies on SWITCH_TO above not falling through (macro defined outside this chunk).
1162         RECONSUME_IN(BogusCommentState);
1163     }
1164     END_STATE()
1165 
1166     BEGIN_STATE(CommentStartState) { // First character after "<!--".
1167         if (cc == '-')
1168             ADVANCE_TO(CommentStartDashState);
1169         else if (cc == '>') {
1170             parseError(); // "<!-->" : reported as an error, but the (empty) comment is still emitted.
1171             return emitAndResumeIn(source, DataState);
1172         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1173             parseError();
1174             return emitAndReconsumeIn(source, DataState);
1175         } else {
1176             m_token->appendToComment(cc);
1177             ADVANCE_TO(CommentState);
1178         }
1179     }
1180     END_STATE()
1181 
1182     BEGIN_STATE(CommentStartDashState) { // Reached after "<!--" followed by one '-'.
1183         if (cc == '-')
1184             ADVANCE_TO(CommentEndState);
1185         else if (cc == '>') {
1186             parseError(); // "<!--->" : reported as an error, but the comment is still emitted.
1187             return emitAndResumeIn(source, DataState);
1188         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1189             parseError();
1190             return emitAndReconsumeIn(source, DataState);
1191         } else {
1192             m_token->appendToComment('-'); // The pending dash turns out to be comment text.
1193             m_token->appendToComment(cc);
1194             ADVANCE_TO(CommentState);
1195         }
1196     }
1197     END_STATE()
1198 
1199     BEGIN_STATE(CommentState) { // Body of a comment: appends characters until a '-' might begin the terminator.
1200         if (cc == '-')
1201             ADVANCE_TO(CommentEndDashState); // The dash is not appended yet; the next state decides.
1202         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1203             parseError();
1204             return emitAndReconsumeIn(source, DataState);
1205         } else {
1206             m_token->appendToComment(cc);
1207             ADVANCE_TO(CommentState);
1208         }
1209     }
1210     END_STATE()
1211 
1212     BEGIN_STATE(CommentEndDashState) { // Reached after one '-' inside a comment body.
1213         if (cc == '-')
1214             ADVANCE_TO(CommentEndState);
1215         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1216             parseError();
1217             return emitAndReconsumeIn(source, DataState);
1218         } else {
1219             m_token->appendToComment('-'); // The pending dash turns out to be comment text.
1220             m_token->appendToComment(cc);
1221             ADVANCE_TO(CommentState);
1222         }
1223     }
1224     END_STATE()
1225 
1226     BEGIN_STATE(CommentEndState) { // Reached after "--" inside a comment; '>' closes it.
1227         if (cc == '>')
1228             return emitAndResumeIn(source, DataState);
1229         else if (cc == '!') {
1230             parseError();
1231             ADVANCE_TO(CommentEndBangState);
1232         } else if (cc == '-') {
1233             parseError();
1234             m_token->appendToComment('-'); // One extra dash becomes text; two dashes remain pending.
1235             ADVANCE_TO(CommentEndState);
1236         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1237             parseError();
1238             return emitAndReconsumeIn(source, DataState);
1239         } else {
1240             parseError();
1241             m_token->appendToComment('-'); // Both pending dashes turn out to be comment text.
1242             m_token->appendToComment('-');
1243             m_token->appendToComment(cc);
1244             ADVANCE_TO(CommentState);
1245         }
1246     }
1247     END_STATE()
1248 
1249     BEGIN_STATE(CommentEndBangState) { // Reached after "--!" inside a comment.
1250         if (cc == '-') {
1251             m_token->appendToComment('-'); // The pending "--!" becomes comment text; the new dash stays pending.
1252             m_token->appendToComment('-');
1253             m_token->appendToComment('!');
1254             ADVANCE_TO(CommentEndDashState);
1255         } else if (cc == '>')
1256             return emitAndResumeIn(source, DataState); // "--!>" closes the comment (the error was reported when '!' was seen).
1257         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1258             parseError();
1259             return emitAndReconsumeIn(source, DataState);
1260         } else {
1261             m_token->appendToComment('-'); // The pending "--!" becomes comment text, followed by the current character.
1262             m_token->appendToComment('-');
1263             m_token->appendToComment('!');
1264             m_token->appendToComment(cc);
1265             ADVANCE_TO(CommentState);
1266         }
1267     }
1268     END_STATE()
1269 
1270     BEGIN_STATE(DOCTYPEState) { // Reached right after the "doctype" keyword; whitespace is expected before the name.
1271         if (isTokenizerWhitespace(cc))
1272             ADVANCE_TO(BeforeDOCTYPENameState);
1273         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1274             parseError();
1275             m_token->beginDOCTYPE(); // Emit a nameless DOCTYPE flagged force-quirks.
1276             m_token->setForceQuirks();
1277             return emitAndReconsumeIn(source, DataState);
1278         } else {
1279             parseError(); // Missing whitespace: report it and process the character as if whitespace had been present.
1280             RECONSUME_IN(BeforeDOCTYPENameState);
1281         }
1282     }
1283     END_STATE()
1284 
1285     BEGIN_STATE(BeforeDOCTYPENameState) { // Skips whitespace and begins the DOCTYPE token with the first name character.
1286         if (isTokenizerWhitespace(cc))
1287             ADVANCE_TO(BeforeDOCTYPENameState);
1288         else if (isASCIIUpper(cc)) {
1289             m_token->beginDOCTYPE(toLowerCase(cc)); // The name is stored lower-cased.
1290             ADVANCE_TO(DOCTYPENameState);
1291         } else if (cc == '>') {
1292             parseError();
1293             m_token->beginDOCTYPE(); // No name was given: emit a nameless DOCTYPE flagged force-quirks.
1294             m_token->setForceQuirks();
1295             return emitAndResumeIn(source, DataState);
1296         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1297             parseError();
1298             m_token->beginDOCTYPE();
1299             m_token->setForceQuirks();
1300             return emitAndReconsumeIn(source, DataState);
1301         } else {
1302             m_token->beginDOCTYPE(cc);
1303             ADVANCE_TO(DOCTYPENameState);
1304         }
1305     }
1306     END_STATE()
1307 
1308     BEGIN_STATE(DOCTYPENameState) { // Appends characters to the DOCTYPE name until whitespace or '>'.
1309         if (isTokenizerWhitespace(cc))
1310             ADVANCE_TO(AfterDOCTYPENameState);
1311         else if (cc == '>')
1312             return emitAndResumeIn(source, DataState);
1313         else if (isASCIIUpper(cc)) {
1314             m_token->appendToName(toLowerCase(cc)); // The name is stored lower-cased.
1315             ADVANCE_TO(DOCTYPENameState);
1316         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1317             parseError();
1318             m_token->setForceQuirks();
1319             return emitAndReconsumeIn(source, DataState);
1320         } else {
1321             m_token->appendToName(cc);
1322             ADVANCE_TO(DOCTYPENameState);
1323         }
1324     }
1325     END_STATE()
1326 
1327     BEGIN_STATE(AfterDOCTYPENameState) { // Reached after the DOCTYPE name: expects '>' or the PUBLIC / SYSTEM keyword (case-insensitive).
1328         if (isTokenizerWhitespace(cc))
1329             ADVANCE_TO(AfterDOCTYPENameState);
1330         else if (cc == '>') // Chained with 'else' like the sibling states, so this does not depend on ADVANCE_TO never falling through.
1331             return emitAndResumeIn(source, DataState);
1332         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1333             parseError();
1334             m_token->setForceQuirks();
1335             return emitAndReconsumeIn(source, DataState);
1336         } else {
1337             DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1338             DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1339             if (cc == 'P' || cc == 'p') {
1340                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1341                 if (result == SegmentedString::DidMatch) {
1342                     advanceStringAndASSERTIgnoringCase(source, "public");
1343                     SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1344                 } else if (result == SegmentedString::NotEnoughCharacters) // Cannot decide yet; bail out of tokenizing for now.
1345                     return haveBufferedCharacterToken();
1346             } else if (cc == 'S' || cc == 's') {
1347                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1348                 if (result == SegmentedString::DidMatch) {
1349                     advanceStringAndASSERTIgnoringCase(source, "system");
1350                     SWITCH_TO(AfterDOCTYPESystemKeywordState);
1351                 } else if (result == SegmentedString::NotEnoughCharacters)
1352                     return haveBufferedCharacterToken();
1353             }
1354             parseError(); // Reached when neither keyword matched; relies on SWITCH_TO above not falling through (macro defined outside this chunk).
1355             m_token->setForceQuirks();
1356             ADVANCE_TO(BogusDOCTYPEState);
1357         }
1358     }
1359     END_STATE()
1360 
1361     BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { // Reached right after the PUBLIC keyword; whitespace is expected before the quoted identifier.
1362         if (isTokenizerWhitespace(cc))
1363             ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1364         else if (cc == '"') {
1365             parseError(); // Quote directly after the keyword (no whitespace): reported, but the identifier is still parsed.
1366             m_token->setPublicIdentifierToEmptyString();
1367             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1368         } else if (cc == '\'') {
1369             parseError();
1370             m_token->setPublicIdentifierToEmptyString();
1371             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1372         } else if (cc == '>') {
1373             parseError();
1374             m_token->setForceQuirks();
1375             return emitAndResumeIn(source, DataState);
1376         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1377             parseError();
1378             m_token->setForceQuirks();
1379             return emitAndReconsumeIn(source, DataState);
1380         } else {
1381             parseError();
1382             m_token->setForceQuirks();
1383             ADVANCE_TO(BogusDOCTYPEState);
1384         }
1385     }
1386     END_STATE()
1387 
1388     BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { // Skips whitespace after PUBLIC and waits for the opening quote of the public identifier.
1389         if (isTokenizerWhitespace(cc))
1390             ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1391         else if (cc == '"') {
1392             m_token->setPublicIdentifierToEmptyString(); // Set to an empty string before any characters are appended.
1393             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1394         } else if (cc == '\'') {
1395             m_token->setPublicIdentifierToEmptyString();
1396             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1397         } else if (cc == '>') {
1398             parseError();
1399             m_token->setForceQuirks();
1400             return emitAndResumeIn(source, DataState);
1401         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1402             parseError();
1403             m_token->setForceQuirks();
1404             return emitAndReconsumeIn(source, DataState);
1405         } else {
1406             parseError();
1407             m_token->setForceQuirks();
1408             ADVANCE_TO(BogusDOCTYPEState);
1409         }
1410     }
1411     END_STATE()
1412 
1413     BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { // Collects the double-quoted public identifier up to the closing '"'.
1414         if (cc == '"')
1415             ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1416         else if (cc == '>') {
1417             parseError(); // '>' before the closing quote ends the DOCTYPE early, flagged force-quirks.
1418             m_token->setForceQuirks();
1419             return emitAndResumeIn(source, DataState);
1420         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1421             parseError();
1422             m_token->setForceQuirks();
1423             return emitAndReconsumeIn(source, DataState);
1424         } else {
1425             m_token->appendToPublicIdentifier(cc);
1426             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1427         }
1428     }
1429     END_STATE()
1430 
1431     BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { // Collects the single-quoted public identifier up to the closing '\''.
1432         if (cc == '\'')
1433             ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1434         else if (cc == '>') {
1435             parseError(); // '>' before the closing quote ends the DOCTYPE early, flagged force-quirks.
1436             m_token->setForceQuirks();
1437             return emitAndResumeIn(source, DataState);
1438         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1439             parseError();
1440             m_token->setForceQuirks();
1441             return emitAndReconsumeIn(source, DataState);
1442         } else {
1443             m_token->appendToPublicIdentifier(cc);
1444             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1445         }
1446     }
1447     END_STATE()
1448 
1449     BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { // Reached right after the closing quote of the public identifier.
1450         if (isTokenizerWhitespace(cc))
1451             ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1452         else if (cc == '>')
1453             return emitAndResumeIn(source, DataState);
1454         else if (cc == '"') {
1455             parseError(); // System identifier starts without separating whitespace: reported, but still parsed.
1456             m_token->setSystemIdentifierToEmptyString();
1457             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1458         } else if (cc == '\'') {
1459             parseError();
1460             m_token->setSystemIdentifierToEmptyString();
1461             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1462         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1463             parseError();
1464             m_token->setForceQuirks();
1465             return emitAndReconsumeIn(source, DataState);
1466         } else {
1467             parseError();
1468             m_token->setForceQuirks();
1469             ADVANCE_TO(BogusDOCTYPEState);
1470         }
1471     }
1472     END_STATE()
1473 
1474     BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { // Skips whitespace after the public identifier; an optional system identifier may follow.
1475         if (isTokenizerWhitespace(cc))
1476             ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1477         else if (cc == '>')
1478             return emitAndResumeIn(source, DataState);
1479         else if (cc == '"') {
1480             m_token->setSystemIdentifierToEmptyString(); // No parse error here: whitespace separated the two identifiers.
1481             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1482         } else if (cc == '\'') {
1483             m_token->setSystemIdentifierToEmptyString();
1484             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1485         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1486             parseError();
1487             m_token->setForceQuirks();
1488             return emitAndReconsumeIn(source, DataState);
1489         } else {
1490             parseError();
1491             m_token->setForceQuirks();
1492             ADVANCE_TO(BogusDOCTYPEState);
1493         }
1494     }
1495     END_STATE()
1496 
1497     BEGIN_STATE(AfterDOCTYPESystemKeywordState) { // Reached right after the SYSTEM keyword; whitespace is expected before the quoted identifier.
1498         if (isTokenizerWhitespace(cc))
1499             ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1500         else if (cc == '"') {
1501             parseError(); // Quote directly after the keyword (no whitespace): reported, but the identifier is still parsed.
1502             m_token->setSystemIdentifierToEmptyString();
1503             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1504         } else if (cc == '\'') {
1505             parseError();
1506             m_token->setSystemIdentifierToEmptyString();
1507             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1508         } else if (cc == '>') {
1509             parseError();
1510             m_token->setForceQuirks();
1511             return emitAndResumeIn(source, DataState);
1512         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1513             parseError();
1514             m_token->setForceQuirks();
1515             return emitAndReconsumeIn(source, DataState);
1516         } else {
1517             parseError();
1518             m_token->setForceQuirks();
1519             ADVANCE_TO(BogusDOCTYPEState);
1520         }
1521     }
1522     END_STATE()
1523 
1524     BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { // Skips whitespace after SYSTEM and waits for the opening quote of the system identifier.
1525         if (isTokenizerWhitespace(cc))
1526             ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1527         else if (cc == '"') { // Chained with 'else' like the sibling states, so this does not depend on ADVANCE_TO never falling through.
1528             m_token->setSystemIdentifierToEmptyString(); // Set to an empty string before any characters are appended.
1529             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1530         } else if (cc == '\'') {
1531             m_token->setSystemIdentifierToEmptyString();
1532             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1533         } else if (cc == '>') {
1534             parseError();
1535             m_token->setForceQuirks();
1536             return emitAndResumeIn(source, DataState);
1537         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1538             parseError();
1539             m_token->setForceQuirks();
1540             return emitAndReconsumeIn(source, DataState);
1541         } else {
1542             parseError();
1543             m_token->setForceQuirks();
1544             ADVANCE_TO(BogusDOCTYPEState);
1545         }
1546     }
1547     END_STATE()
1548 
1549     BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1550         if (cc == '"')
1551             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1552         else if (cc == '>') {
1553             parseError();
1554             m_token->setForceQuirks();
1555             return emitAndResumeIn(source, DataState);
1556         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1557             parseError();
1558             m_token->setForceQuirks();
1559             return emitAndReconsumeIn(source, DataState);
1560         } else {
1561             m_token->appendToSystemIdentifier(cc);
1562             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1563         }
1564     }
1565     END_STATE()
1566 
1567     BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1568         if (cc == '\'')
1569             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1570         else if (cc == '>') {
1571             parseError();
1572             m_token->setForceQuirks();
1573             return emitAndResumeIn(source, DataState);
1574         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1575             parseError();
1576             m_token->setForceQuirks();
1577             return emitAndReconsumeIn(source, DataState);
1578         } else {
1579             m_token->appendToSystemIdentifier(cc);
1580             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1581         }
1582     }
1583     END_STATE()
1584 
1585     BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1586         if (isTokenizerWhitespace(cc))
1587             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1588         else if (cc == '>')
1589             return emitAndResumeIn(source, DataState);
1590         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1591             parseError();
1592             m_token->setForceQuirks();
1593             return emitAndReconsumeIn(source, DataState);
1594         } else {
1595             parseError();
1596             ADVANCE_TO(BogusDOCTYPEState);
1597         }
1598     }
1599     END_STATE()
1600 
1601     BEGIN_STATE(BogusDOCTYPEState) {
1602         if (cc == '>')
1603             return emitAndResumeIn(source, DataState);
1604         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1605             return emitAndReconsumeIn(source, DataState);
1606         ADVANCE_TO(BogusDOCTYPEState);
1607     }
1608     END_STATE()
1609 
1610     BEGIN_STATE(CDATASectionState) {
1611         if (cc == ']')
1612             ADVANCE_TO(CDATASectionRightSquareBracketState);
1613         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1614             RECONSUME_IN(DataState);
1615         else {
1616             bufferCharacter(cc);
1617             ADVANCE_TO(CDATASectionState);
1618         }
1619     }
1620     END_STATE()
1621 
1622     BEGIN_STATE(CDATASectionRightSquareBracketState) {
1623         if (cc == ']')
1624             ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1625         else {
1626             bufferCharacter(']');
1627             RECONSUME_IN(CDATASectionState);
1628         }
1629     }
1630 
1631     BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1632         if (cc == '>')
1633             ADVANCE_TO(DataState);
1634         else {
1635             bufferCharacter(']');
1636             bufferCharacter(']');
1637             RECONSUME_IN(CDATASectionState);
1638         }
1639     }
1640     END_STATE()
1641 
1642     }
1643 
1644     ASSERT_NOT_REACHED();
1645     return false;
1646 }
1647 
updateStateFor(const AtomicString & tagName,Frame * frame)1648 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
1649 {
1650     if (tagName == textareaTag || tagName == titleTag)
1651         setState(RCDATAState);
1652     else if (tagName == plaintextTag)
1653         setState(PLAINTEXTState);
1654     else if (tagName == scriptTag)
1655         setState(ScriptDataState);
1656     else if (tagName == styleTag
1657         || tagName == iframeTag
1658         || tagName == xmpTag
1659         || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
1660         || tagName == noframesTag
1661         || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
1662         setState(RAWTEXTState);
1663 }
1664 
temporaryBufferIs(const String & expectedString)1665 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1666 {
1667     return vectorEqualsString(m_temporaryBuffer, expectedString);
1668 }
1669 
addToPossibleEndTag(UChar cc)1670 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
1671 {
1672     ASSERT(isEndTagBufferingState(m_state));
1673     m_bufferedEndTagName.append(cc);
1674 }
1675 
isAppropriateEndTag()1676 inline bool HTMLTokenizer::isAppropriateEndTag()
1677 {
1678     return m_bufferedEndTagName == m_appropriateEndTagName;
1679 }
1680 
bufferCharacter(UChar character)1681 inline void HTMLTokenizer::bufferCharacter(UChar character)
1682 {
1683     ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
1684     m_token->ensureIsCharacterToken();
1685     m_token->appendToCharacter(character);
1686 }
1687 
parseError()1688 inline void HTMLTokenizer::parseError()
1689 {
1690     notImplemented();
1691 }
1692 
haveBufferedCharacterToken()1693 inline bool HTMLTokenizer::haveBufferedCharacterToken()
1694 {
1695     return m_token->type() == HTMLToken::Character;
1696 }
1697 
1698 }
1699