1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29 #include "HTMLTokenizer.h"
30
31 #include "HTMLEntityParser.h"
32 #include "HTMLToken.h"
33 #include "HTMLTreeBuilder.h"
34 #include "HTMLNames.h"
35 #include "NotImplemented.h"
36 #include <wtf/ASCIICType.h>
37 #include <wtf/CurrentTime.h>
38 #include <wtf/UnusedParam.h>
39 #include <wtf/text/AtomicString.h>
40 #include <wtf/text/CString.h>
41 #include <wtf/unicode/Unicode.h>
42
43 using namespace WTF;
44
45 namespace WebCore {
46
47 using namespace HTMLNames;
48
// Sentinel code unit (value 0) that the state machine compares the current
// input character against to recognize end of input.
const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0;
50
51 namespace {
52
toLowerCase(UChar cc)53 inline UChar toLowerCase(UChar cc)
54 {
55 ASSERT(isASCIIUpper(cc));
56 const int lowerCaseOffset = 0x20;
57 return cc + lowerCaseOffset;
58 }
59
isTokenizerWhitespace(UChar cc)60 inline bool isTokenizerWhitespace(UChar cc)
61 {
62 return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C';
63 }
64
advanceStringAndASSERTIgnoringCase(SegmentedString & source,const char * expectedCharacters)65 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
66 {
67 while (*expectedCharacters)
68 source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
69 }
70
advanceStringAndASSERT(SegmentedString & source,const char * expectedCharacters)71 inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters)
72 {
73 while (*expectedCharacters)
74 source.advanceAndASSERT(*expectedCharacters++);
75 }
76
vectorEqualsString(const Vector<UChar,32> & vector,const String & string)77 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
78 {
79 if (vector.size() != string.length())
80 return false;
81 const UChar* stringData = string.characters();
82 const UChar* vectorData = vector.data();
83 // FIXME: Is there a higher-level function we should be calling here?
84 return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
85 }
86
isEndTagBufferingState(HTMLTokenizer::State state)87 inline bool isEndTagBufferingState(HTMLTokenizer::State state)
88 {
89 switch (state) {
90 case HTMLTokenizer::RCDATAEndTagOpenState:
91 case HTMLTokenizer::RCDATAEndTagNameState:
92 case HTMLTokenizer::RAWTEXTEndTagOpenState:
93 case HTMLTokenizer::RAWTEXTEndTagNameState:
94 case HTMLTokenizer::ScriptDataEndTagOpenState:
95 case HTMLTokenizer::ScriptDataEndTagNameState:
96 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
97 case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
98 return true;
99 default:
100 return false;
101 }
102 }
103
104 }
105
HTMLTokenizer(bool usePreHTML5ParserQuirks)106 HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
107 : m_inputStreamPreprocessor(this)
108 , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
109 {
110 reset();
111 }
112
~HTMLTokenizer()113 HTMLTokenizer::~HTMLTokenizer()
114 {
115 }
116
reset()117 void HTMLTokenizer::reset()
118 {
119 m_state = DataState;
120 m_token = 0;
121 m_lineNumber = 0;
122 m_skipLeadingNewLineForListing = false;
123 m_forceNullCharacterReplacement = false;
124 m_shouldAllowCDATA = false;
125 m_additionalAllowedCharacter = '\0';
126 }
127
processEntity(SegmentedString & source)128 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
129 {
130 bool notEnoughCharacters = false;
131 Vector<UChar, 16> decodedEntity;
132 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
133 if (notEnoughCharacters)
134 return false;
135 if (!success) {
136 ASSERT(decodedEntity.isEmpty());
137 bufferCharacter('&');
138 } else {
139 Vector<UChar>::const_iterator iter = decodedEntity.begin();
140 for (; iter != decodedEntity.end(); ++iter)
141 bufferCharacter(*iter);
142 }
143 return true;
144 }
145
146 #if COMPILER(MSVC)
147 // We need to disable the "unreachable code" warning because we want to assert
148 // that some code points aren't reached in the state machine.
149 #pragma warning(disable: 4702)
150 #endif
151
// Opens the handler for one tokenizer state. It expands to both a switch
// case and a goto label with the same name, so a handler can be entered
// either through a switch on the state value or through a goto to the
// state's label.
#define BEGIN_STATE(stateName) case stateName: stateName:
// Closes a state handler. Handlers are expected to leave through a return or
// a goto, so falling through to this point trips the assertion.
#define END_STATE() ASSERT_NOT_REACHED(); break;
154
// We use this macro when the HTML5 spec says "reconsume the current input
// character in the <mumble> state."
//
// Records stateName in m_state and jumps straight to that state's label
// without touching the input, so the target handler sees the current
// character again.
#define RECONSUME_IN(stateName) \
    do { \
        m_state = stateName; \
        goto stateName; \
    } while (false)
162
// We use this macro when the HTML5 spec says "consume the next input
// character ... and switch to the <mumble> state."
//
// m_state is updated before the input is advanced, so if advance() returns
// false and we return early (with the result of haveBufferedCharacterToken()),
// the tokenizer is already parked in stateName. Otherwise cc is reloaded
// with the new current input character and control jumps to the state's
// label. The expansion uses the `source` and `cc` variables of the function
// it is expanded in.
#define ADVANCE_TO(stateName) \
    do { \
        m_state = stateName; \
        if (!m_inputStreamPreprocessor.advance(source, m_lineNumber)) \
            return haveBufferedCharacterToken(); \
        cc = m_inputStreamPreprocessor.nextInputCharacter(); \
        goto stateName; \
    } while (false)
173
// Sometimes there's more complicated logic in the spec that separates when
// we consume the next input character and when we switch to a particular
// state. We handle those cases by advancing the source directly and using
// this macro to switch to the indicated state.
//
// Unlike ADVANCE_TO, this macro does not advance the input itself: it only
// peeks at the current position. If source is empty or peek() returns false,
// it returns the result of haveBufferedCharacterToken() with m_state already
// set to stateName; otherwise it reloads cc and jumps to the state's label.
// The expansion uses the `source` and `cc` variables of the function it is
// expanded in.
#define SWITCH_TO(stateName) \
    do { \
        m_state = stateName; \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
            return haveBufferedCharacterToken(); \
        cc = m_inputStreamPreprocessor.nextInputCharacter(); \
        goto stateName; \
    } while (false)
186
187
saveEndTagNameIfNeeded()188 inline void HTMLTokenizer::saveEndTagNameIfNeeded()
189 {
190 ASSERT(m_token->type() != HTMLToken::Uninitialized);
191 if (m_token->type() == HTMLToken::StartTag)
192 m_appropriateEndTagName = m_token->name();
193 }
194
195 // We use this function when the HTML5 spec says "Emit the current <mumble>
196 // token. Switch to the <mumble> state." We use the word "resume" instead of
197 // switch to indicate that this macro actually returns and that we'll end up
198 // in the state when we "resume" (i.e., are called again).
emitAndResumeIn(SegmentedString & source,State state)199 bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state)
200 {
201 m_state = state;
202 source.advance(m_lineNumber);
203 saveEndTagNameIfNeeded();
204 return true;
205 }
206
207 // Identical to emitAndResumeIn, except does not advance.
emitAndReconsumeIn(SegmentedString &,State state)208 bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state)
209 {
210 m_state = state;
211 saveEndTagNameIfNeeded();
212 return true;
213 }
214
215 // Used to emit the EndOfFile token.
216 // Check if we have buffered characters to emit first before emitting the EOF.
emitEndOfFile(SegmentedString & source)217 bool HTMLTokenizer::emitEndOfFile(SegmentedString& source)
218 {
219 if (haveBufferedCharacterToken())
220 return true;
221 m_state = DataState;
222 source.advance(m_lineNumber);
223 m_token->clear();
224 m_token->makeEndOfFile();
225 return true;
226 }
227
flushBufferedEndTag(SegmentedString & source)228 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
229 {
230 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
231 source.advance(m_lineNumber);
232 if (m_token->type() == HTMLToken::Character)
233 return true;
234 m_token->beginEndTag(m_bufferedEndTagName);
235 m_bufferedEndTagName.clear();
236 return false;
237 }
238
// Flushes the buffered end tag and continues in stateName.
//
// m_state is set first. If flushBufferedEndTag() returns true, the macro
// returns true immediately. Otherwise it peeks at the input: if source is
// empty or peek() returns false it returns the result of
// haveBufferedCharacterToken(); if not, it reloads cc and jumps to the
// state's label. The expansion uses the `source` and `cc` variables of the
// function it is expanded in.
#define FLUSH_AND_ADVANCE_TO(stateName) \
    do { \
        m_state = stateName; \
        if (flushBufferedEndTag(source)) \
            return true; \
        if (source.isEmpty() \
            || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
            return haveBufferedCharacterToken(); \
        cc = m_inputStreamPreprocessor.nextInputCharacter(); \
        goto stateName; \
    } while (false)
250
flushEmitAndResumeIn(SegmentedString & source,State state)251 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state)
252 {
253 m_state = state;
254 flushBufferedEndTag(source);
255 return true;
256 }
257
nextToken(SegmentedString & source,HTMLToken & token)258 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
259 {
260 // If we have a token in progress, then we're supposed to be called back
261 // with the same token so we can finish it.
262 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
263 m_token = &token;
264
265 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
266 // FIXME: This should call flushBufferedEndTag().
267 // We started an end tag during our last iteration.
268 m_token->beginEndTag(m_bufferedEndTagName);
269 m_bufferedEndTagName.clear();
270 if (m_state == DataState) {
271 // We're back in the data state, so we must be done with the tag.
272 return true;
273 }
274 }
275
276 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
277 return haveBufferedCharacterToken();
278 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
279
280 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
281 // Note that this logic is different than the generic \r\n collapsing
282 // handled in the input stream preprocessor. This logic is here as an
283 // "authoring convenience" so folks can write:
284 //
285 // <pre>
286 // lorem ipsum
287 // lorem ipsum
288 // </pre>
289 //
290 // without getting an extra newline at the start of their <pre> element.
291 if (m_skipLeadingNewLineForListing) {
292 m_skipLeadingNewLineForListing = false;
293 if (cc == '\n') {
294 if (m_state == DataState)
295 ADVANCE_TO(DataState);
296 if (m_state == RCDATAState)
297 ADVANCE_TO(RCDATAState);
298 // When parsing text/plain documents, we run the tokenizer in the
299 // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
300 ASSERT(m_state == PLAINTEXTState);
301 }
302 }
303
304 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
305 switch (m_state) {
306 BEGIN_STATE(DataState) {
307 if (cc == '&')
308 ADVANCE_TO(CharacterReferenceInDataState);
309 else if (cc == '<') {
310 if (m_token->type() == HTMLToken::Character) {
311 // We have a bunch of character tokens queued up that we
312 // are emitting lazily here.
313 return true;
314 }
315 ADVANCE_TO(TagOpenState);
316 } else if (cc == InputStreamPreprocessor::endOfFileMarker)
317 return emitEndOfFile(source);
318 else {
319 bufferCharacter(cc);
320 ADVANCE_TO(DataState);
321 }
322 }
323 END_STATE()
324
325 BEGIN_STATE(CharacterReferenceInDataState) {
326 if (!processEntity(source))
327 return haveBufferedCharacterToken();
328 SWITCH_TO(DataState);
329 }
330 END_STATE()
331
332 BEGIN_STATE(RCDATAState) {
333 if (cc == '&')
334 ADVANCE_TO(CharacterReferenceInRCDATAState);
335 else if (cc == '<')
336 ADVANCE_TO(RCDATALessThanSignState);
337 else if (cc == InputStreamPreprocessor::endOfFileMarker)
338 return emitEndOfFile(source);
339 else {
340 bufferCharacter(cc);
341 ADVANCE_TO(RCDATAState);
342 }
343 }
344 END_STATE()
345
346 BEGIN_STATE(CharacterReferenceInRCDATAState) {
347 if (!processEntity(source))
348 return haveBufferedCharacterToken();
349 SWITCH_TO(RCDATAState);
350 }
351 END_STATE()
352
353 BEGIN_STATE(RAWTEXTState) {
354 if (cc == '<')
355 ADVANCE_TO(RAWTEXTLessThanSignState);
356 else if (cc == InputStreamPreprocessor::endOfFileMarker)
357 return emitEndOfFile(source);
358 else {
359 bufferCharacter(cc);
360 ADVANCE_TO(RAWTEXTState);
361 }
362 }
363 END_STATE()
364
365 BEGIN_STATE(ScriptDataState) {
366 if (cc == '<')
367 ADVANCE_TO(ScriptDataLessThanSignState);
368 else if (cc == InputStreamPreprocessor::endOfFileMarker)
369 return emitEndOfFile(source);
370 else {
371 bufferCharacter(cc);
372 ADVANCE_TO(ScriptDataState);
373 }
374 }
375 END_STATE()
376
377 BEGIN_STATE(PLAINTEXTState) {
378 if (cc == InputStreamPreprocessor::endOfFileMarker)
379 return emitEndOfFile(source);
380 else
381 bufferCharacter(cc);
382 ADVANCE_TO(PLAINTEXTState);
383 }
384 END_STATE()
385
386 BEGIN_STATE(TagOpenState) {
387 if (cc == '!')
388 ADVANCE_TO(MarkupDeclarationOpenState);
389 else if (cc == '/')
390 ADVANCE_TO(EndTagOpenState);
391 else if (isASCIIUpper(cc)) {
392 m_token->beginStartTag(toLowerCase(cc));
393 ADVANCE_TO(TagNameState);
394 } else if (isASCIILower(cc)) {
395 m_token->beginStartTag(cc);
396 ADVANCE_TO(TagNameState);
397 } else if (cc == '?') {
398 parseError();
399 // The spec consumes the current character before switching
400 // to the bogus comment state, but it's easier to implement
401 // if we reconsume the current character.
402 RECONSUME_IN(BogusCommentState);
403 } else {
404 parseError();
405 bufferCharacter('<');
406 RECONSUME_IN(DataState);
407 }
408 }
409 END_STATE()
410
411 BEGIN_STATE(EndTagOpenState) {
412 if (isASCIIUpper(cc)) {
413 m_token->beginEndTag(toLowerCase(cc));
414 ADVANCE_TO(TagNameState);
415 } else if (isASCIILower(cc)) {
416 m_token->beginEndTag(cc);
417 ADVANCE_TO(TagNameState);
418 } else if (cc == '>') {
419 parseError();
420 ADVANCE_TO(DataState);
421 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
422 parseError();
423 bufferCharacter('<');
424 bufferCharacter('/');
425 RECONSUME_IN(DataState);
426 } else {
427 parseError();
428 RECONSUME_IN(BogusCommentState);
429 }
430 }
431 END_STATE()
432
433 BEGIN_STATE(TagNameState) {
434 if (isTokenizerWhitespace(cc))
435 ADVANCE_TO(BeforeAttributeNameState);
436 else if (cc == '/')
437 ADVANCE_TO(SelfClosingStartTagState);
438 else if (cc == '>')
439 return emitAndResumeIn(source, DataState);
440 else if (m_usePreHTML5ParserQuirks && cc == '<')
441 return emitAndReconsumeIn(source, DataState);
442 else if (isASCIIUpper(cc)) {
443 m_token->appendToName(toLowerCase(cc));
444 ADVANCE_TO(TagNameState);
445 } if (cc == InputStreamPreprocessor::endOfFileMarker) {
446 parseError();
447 RECONSUME_IN(DataState);
448 } else {
449 m_token->appendToName(cc);
450 ADVANCE_TO(TagNameState);
451 }
452 }
453 END_STATE()
454
455 BEGIN_STATE(RCDATALessThanSignState) {
456 if (cc == '/') {
457 m_temporaryBuffer.clear();
458 ASSERT(m_bufferedEndTagName.isEmpty());
459 ADVANCE_TO(RCDATAEndTagOpenState);
460 } else {
461 bufferCharacter('<');
462 RECONSUME_IN(RCDATAState);
463 }
464 }
465 END_STATE()
466
467 BEGIN_STATE(RCDATAEndTagOpenState) {
468 if (isASCIIUpper(cc)) {
469 m_temporaryBuffer.append(cc);
470 addToPossibleEndTag(toLowerCase(cc));
471 ADVANCE_TO(RCDATAEndTagNameState);
472 } else if (isASCIILower(cc)) {
473 m_temporaryBuffer.append(cc);
474 addToPossibleEndTag(cc);
475 ADVANCE_TO(RCDATAEndTagNameState);
476 } else {
477 bufferCharacter('<');
478 bufferCharacter('/');
479 RECONSUME_IN(RCDATAState);
480 }
481 }
482 END_STATE()
483
484 BEGIN_STATE(RCDATAEndTagNameState) {
485 if (isASCIIUpper(cc)) {
486 m_temporaryBuffer.append(cc);
487 addToPossibleEndTag(toLowerCase(cc));
488 ADVANCE_TO(RCDATAEndTagNameState);
489 } else if (isASCIILower(cc)) {
490 m_temporaryBuffer.append(cc);
491 addToPossibleEndTag(cc);
492 ADVANCE_TO(RCDATAEndTagNameState);
493 } else {
494 if (isTokenizerWhitespace(cc)) {
495 if (isAppropriateEndTag())
496 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
497 } else if (cc == '/') {
498 if (isAppropriateEndTag())
499 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
500 } else if (cc == '>') {
501 if (isAppropriateEndTag())
502 return flushEmitAndResumeIn(source, DataState);
503 }
504 bufferCharacter('<');
505 bufferCharacter('/');
506 m_token->appendToCharacter(m_temporaryBuffer);
507 m_bufferedEndTagName.clear();
508 RECONSUME_IN(RCDATAState);
509 }
510 }
511 END_STATE()
512
513 BEGIN_STATE(RAWTEXTLessThanSignState) {
514 if (cc == '/') {
515 m_temporaryBuffer.clear();
516 ASSERT(m_bufferedEndTagName.isEmpty());
517 ADVANCE_TO(RAWTEXTEndTagOpenState);
518 } else {
519 bufferCharacter('<');
520 RECONSUME_IN(RAWTEXTState);
521 }
522 }
523 END_STATE()
524
525 BEGIN_STATE(RAWTEXTEndTagOpenState) {
526 if (isASCIIUpper(cc)) {
527 m_temporaryBuffer.append(cc);
528 addToPossibleEndTag(toLowerCase(cc));
529 ADVANCE_TO(RAWTEXTEndTagNameState);
530 } else if (isASCIILower(cc)) {
531 m_temporaryBuffer.append(cc);
532 addToPossibleEndTag(cc);
533 ADVANCE_TO(RAWTEXTEndTagNameState);
534 } else {
535 bufferCharacter('<');
536 bufferCharacter('/');
537 RECONSUME_IN(RAWTEXTState);
538 }
539 }
540 END_STATE()
541
542 BEGIN_STATE(RAWTEXTEndTagNameState) {
543 if (isASCIIUpper(cc)) {
544 m_temporaryBuffer.append(cc);
545 addToPossibleEndTag(toLowerCase(cc));
546 ADVANCE_TO(RAWTEXTEndTagNameState);
547 } else if (isASCIILower(cc)) {
548 m_temporaryBuffer.append(cc);
549 addToPossibleEndTag(cc);
550 ADVANCE_TO(RAWTEXTEndTagNameState);
551 } else {
552 if (isTokenizerWhitespace(cc)) {
553 if (isAppropriateEndTag())
554 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
555 } else if (cc == '/') {
556 if (isAppropriateEndTag())
557 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
558 } else if (cc == '>') {
559 if (isAppropriateEndTag())
560 return flushEmitAndResumeIn(source, DataState);
561 }
562 bufferCharacter('<');
563 bufferCharacter('/');
564 m_token->appendToCharacter(m_temporaryBuffer);
565 m_bufferedEndTagName.clear();
566 RECONSUME_IN(RAWTEXTState);
567 }
568 }
569 END_STATE()
570
571 BEGIN_STATE(ScriptDataLessThanSignState) {
572 if (cc == '/') {
573 m_temporaryBuffer.clear();
574 ASSERT(m_bufferedEndTagName.isEmpty());
575 ADVANCE_TO(ScriptDataEndTagOpenState);
576 } else if (cc == '!') {
577 bufferCharacter('<');
578 bufferCharacter('!');
579 ADVANCE_TO(ScriptDataEscapeStartState);
580 } else {
581 bufferCharacter('<');
582 RECONSUME_IN(ScriptDataState);
583 }
584 }
585 END_STATE()
586
587 BEGIN_STATE(ScriptDataEndTagOpenState) {
588 if (isASCIIUpper(cc)) {
589 m_temporaryBuffer.append(cc);
590 addToPossibleEndTag(toLowerCase(cc));
591 ADVANCE_TO(ScriptDataEndTagNameState);
592 } else if (isASCIILower(cc)) {
593 m_temporaryBuffer.append(cc);
594 addToPossibleEndTag(cc);
595 ADVANCE_TO(ScriptDataEndTagNameState);
596 } else {
597 bufferCharacter('<');
598 bufferCharacter('/');
599 RECONSUME_IN(ScriptDataState);
600 }
601 }
602 END_STATE()
603
604 BEGIN_STATE(ScriptDataEndTagNameState) {
605 if (isASCIIUpper(cc)) {
606 m_temporaryBuffer.append(cc);
607 addToPossibleEndTag(toLowerCase(cc));
608 ADVANCE_TO(ScriptDataEndTagNameState);
609 } else if (isASCIILower(cc)) {
610 m_temporaryBuffer.append(cc);
611 addToPossibleEndTag(cc);
612 ADVANCE_TO(ScriptDataEndTagNameState);
613 } else {
614 if (isTokenizerWhitespace(cc)) {
615 if (isAppropriateEndTag())
616 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
617 } else if (cc == '/') {
618 if (isAppropriateEndTag())
619 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
620 } else if (cc == '>') {
621 if (isAppropriateEndTag())
622 return flushEmitAndResumeIn(source, DataState);
623 }
624 bufferCharacter('<');
625 bufferCharacter('/');
626 m_token->appendToCharacter(m_temporaryBuffer);
627 m_bufferedEndTagName.clear();
628 RECONSUME_IN(ScriptDataState);
629 }
630 }
631 END_STATE()
632
633 BEGIN_STATE(ScriptDataEscapeStartState) {
634 if (cc == '-') {
635 bufferCharacter(cc);
636 ADVANCE_TO(ScriptDataEscapeStartDashState);
637 } else
638 RECONSUME_IN(ScriptDataState);
639 }
640 END_STATE()
641
642 BEGIN_STATE(ScriptDataEscapeStartDashState) {
643 if (cc == '-') {
644 bufferCharacter(cc);
645 ADVANCE_TO(ScriptDataEscapedDashDashState);
646 } else
647 RECONSUME_IN(ScriptDataState);
648 }
649 END_STATE()
650
651 BEGIN_STATE(ScriptDataEscapedState) {
652 if (cc == '-') {
653 bufferCharacter(cc);
654 ADVANCE_TO(ScriptDataEscapedDashState);
655 } else if (cc == '<')
656 ADVANCE_TO(ScriptDataEscapedLessThanSignState);
657 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
658 parseError();
659 RECONSUME_IN(DataState);
660 } else {
661 bufferCharacter(cc);
662 ADVANCE_TO(ScriptDataEscapedState);
663 }
664 }
665 END_STATE()
666
667 BEGIN_STATE(ScriptDataEscapedDashState) {
668 if (cc == '-') {
669 bufferCharacter(cc);
670 ADVANCE_TO(ScriptDataEscapedDashDashState);
671 } else if (cc == '<')
672 ADVANCE_TO(ScriptDataEscapedLessThanSignState);
673 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
674 parseError();
675 RECONSUME_IN(DataState);
676 } else {
677 bufferCharacter(cc);
678 ADVANCE_TO(ScriptDataEscapedState);
679 }
680 }
681 END_STATE()
682
683 BEGIN_STATE(ScriptDataEscapedDashDashState) {
684 if (cc == '-') {
685 bufferCharacter(cc);
686 ADVANCE_TO(ScriptDataEscapedDashDashState);
687 } else if (cc == '<')
688 ADVANCE_TO(ScriptDataEscapedLessThanSignState);
689 else if (cc == '>') {
690 bufferCharacter(cc);
691 ADVANCE_TO(ScriptDataState);
692 } if (cc == InputStreamPreprocessor::endOfFileMarker) {
693 parseError();
694 RECONSUME_IN(DataState);
695 } else {
696 bufferCharacter(cc);
697 ADVANCE_TO(ScriptDataEscapedState);
698 }
699 }
700 END_STATE()
701
702 BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
703 if (cc == '/') {
704 m_temporaryBuffer.clear();
705 ASSERT(m_bufferedEndTagName.isEmpty());
706 ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
707 } else if (isASCIIUpper(cc)) {
708 bufferCharacter('<');
709 bufferCharacter(cc);
710 m_temporaryBuffer.clear();
711 m_temporaryBuffer.append(toLowerCase(cc));
712 ADVANCE_TO(ScriptDataDoubleEscapeStartState);
713 } else if (isASCIILower(cc)) {
714 bufferCharacter('<');
715 bufferCharacter(cc);
716 m_temporaryBuffer.clear();
717 m_temporaryBuffer.append(cc);
718 ADVANCE_TO(ScriptDataDoubleEscapeStartState);
719 } else {
720 bufferCharacter('<');
721 RECONSUME_IN(ScriptDataEscapedState);
722 }
723 }
724 END_STATE()
725
726 BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
727 if (isASCIIUpper(cc)) {
728 m_temporaryBuffer.append(cc);
729 addToPossibleEndTag(toLowerCase(cc));
730 ADVANCE_TO(ScriptDataEscapedEndTagNameState);
731 } else if (isASCIILower(cc)) {
732 m_temporaryBuffer.append(cc);
733 addToPossibleEndTag(cc);
734 ADVANCE_TO(ScriptDataEscapedEndTagNameState);
735 } else {
736 bufferCharacter('<');
737 bufferCharacter('/');
738 RECONSUME_IN(ScriptDataEscapedState);
739 }
740 }
741 END_STATE()
742
743 BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
744 if (isASCIIUpper(cc)) {
745 m_temporaryBuffer.append(cc);
746 addToPossibleEndTag(toLowerCase(cc));
747 ADVANCE_TO(ScriptDataEscapedEndTagNameState);
748 } else if (isASCIILower(cc)) {
749 m_temporaryBuffer.append(cc);
750 addToPossibleEndTag(cc);
751 ADVANCE_TO(ScriptDataEscapedEndTagNameState);
752 } else {
753 if (isTokenizerWhitespace(cc)) {
754 if (isAppropriateEndTag())
755 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
756 } else if (cc == '/') {
757 if (isAppropriateEndTag())
758 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
759 } else if (cc == '>') {
760 if (isAppropriateEndTag())
761 return flushEmitAndResumeIn(source, DataState);
762 }
763 bufferCharacter('<');
764 bufferCharacter('/');
765 m_token->appendToCharacter(m_temporaryBuffer);
766 m_bufferedEndTagName.clear();
767 RECONSUME_IN(ScriptDataEscapedState);
768 }
769 }
770 END_STATE()
771
772 BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
773 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
774 bufferCharacter(cc);
775 if (temporaryBufferIs(scriptTag.localName()))
776 ADVANCE_TO(ScriptDataDoubleEscapedState);
777 else
778 ADVANCE_TO(ScriptDataEscapedState);
779 } else if (isASCIIUpper(cc)) {
780 bufferCharacter(cc);
781 m_temporaryBuffer.append(toLowerCase(cc));
782 ADVANCE_TO(ScriptDataDoubleEscapeStartState);
783 } else if (isASCIILower(cc)) {
784 bufferCharacter(cc);
785 m_temporaryBuffer.append(cc);
786 ADVANCE_TO(ScriptDataDoubleEscapeStartState);
787 } else
788 RECONSUME_IN(ScriptDataEscapedState);
789 }
790 END_STATE()
791
792 BEGIN_STATE(ScriptDataDoubleEscapedState) {
793 if (cc == '-') {
794 bufferCharacter(cc);
795 ADVANCE_TO(ScriptDataDoubleEscapedDashState);
796 } else if (cc == '<') {
797 bufferCharacter(cc);
798 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
799 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
800 parseError();
801 RECONSUME_IN(DataState);
802 } else {
803 bufferCharacter(cc);
804 ADVANCE_TO(ScriptDataDoubleEscapedState);
805 }
806 }
807 END_STATE()
808
809 BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
810 if (cc == '-') {
811 bufferCharacter(cc);
812 ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
813 } else if (cc == '<') {
814 bufferCharacter(cc);
815 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
816 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
817 parseError();
818 RECONSUME_IN(DataState);
819 } else {
820 bufferCharacter(cc);
821 ADVANCE_TO(ScriptDataDoubleEscapedState);
822 }
823 }
824 END_STATE()
825
826 BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
827 if (cc == '-') {
828 bufferCharacter(cc);
829 ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
830 } else if (cc == '<') {
831 bufferCharacter(cc);
832 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
833 } else if (cc == '>') {
834 bufferCharacter(cc);
835 ADVANCE_TO(ScriptDataState);
836 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
837 parseError();
838 RECONSUME_IN(DataState);
839 } else {
840 bufferCharacter(cc);
841 ADVANCE_TO(ScriptDataDoubleEscapedState);
842 }
843 }
844 END_STATE()
845
846 BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
847 if (cc == '/') {
848 bufferCharacter(cc);
849 m_temporaryBuffer.clear();
850 ADVANCE_TO(ScriptDataDoubleEscapeEndState);
851 } else
852 RECONSUME_IN(ScriptDataDoubleEscapedState);
853 }
854 END_STATE()
855
856 BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
857 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
858 bufferCharacter(cc);
859 if (temporaryBufferIs(scriptTag.localName()))
860 ADVANCE_TO(ScriptDataEscapedState);
861 else
862 ADVANCE_TO(ScriptDataDoubleEscapedState);
863 } else if (isASCIIUpper(cc)) {
864 bufferCharacter(cc);
865 m_temporaryBuffer.append(toLowerCase(cc));
866 ADVANCE_TO(ScriptDataDoubleEscapeEndState);
867 } else if (isASCIILower(cc)) {
868 bufferCharacter(cc);
869 m_temporaryBuffer.append(cc);
870 ADVANCE_TO(ScriptDataDoubleEscapeEndState);
871 } else
872 RECONSUME_IN(ScriptDataDoubleEscapedState);
873 }
874 END_STATE()
875
876 BEGIN_STATE(BeforeAttributeNameState) {
877 if (isTokenizerWhitespace(cc))
878 ADVANCE_TO(BeforeAttributeNameState);
879 else if (cc == '/')
880 ADVANCE_TO(SelfClosingStartTagState);
881 else if (cc == '>')
882 return emitAndResumeIn(source, DataState);
883 else if (m_usePreHTML5ParserQuirks && cc == '<')
884 return emitAndReconsumeIn(source, DataState);
885 else if (isASCIIUpper(cc)) {
886 m_token->addNewAttribute();
887 m_token->beginAttributeName(source.numberOfCharactersConsumed());
888 m_token->appendToAttributeName(toLowerCase(cc));
889 ADVANCE_TO(AttributeNameState);
890 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
891 parseError();
892 RECONSUME_IN(DataState);
893 } else {
894 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
895 parseError();
896 m_token->addNewAttribute();
897 m_token->beginAttributeName(source.numberOfCharactersConsumed());
898 m_token->appendToAttributeName(cc);
899 ADVANCE_TO(AttributeNameState);
900 }
901 }
902 END_STATE()
903
904 BEGIN_STATE(AttributeNameState) {
905 if (isTokenizerWhitespace(cc)) {
906 m_token->endAttributeName(source.numberOfCharactersConsumed());
907 ADVANCE_TO(AfterAttributeNameState);
908 } else if (cc == '/') {
909 m_token->endAttributeName(source.numberOfCharactersConsumed());
910 ADVANCE_TO(SelfClosingStartTagState);
911 } else if (cc == '=') {
912 m_token->endAttributeName(source.numberOfCharactersConsumed());
913 ADVANCE_TO(BeforeAttributeValueState);
914 } else if (cc == '>') {
915 m_token->endAttributeName(source.numberOfCharactersConsumed());
916 return emitAndResumeIn(source, DataState);
917 } else if (m_usePreHTML5ParserQuirks && cc == '<') {
918 m_token->endAttributeName(source.numberOfCharactersConsumed());
919 return emitAndReconsumeIn(source, DataState);
920 } else if (isASCIIUpper(cc)) {
921 m_token->appendToAttributeName(toLowerCase(cc));
922 ADVANCE_TO(AttributeNameState);
923 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
924 parseError();
925 m_token->endAttributeName(source.numberOfCharactersConsumed());
926 RECONSUME_IN(DataState);
927 } else {
928 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
929 parseError();
930 m_token->appendToAttributeName(cc);
931 ADVANCE_TO(AttributeNameState);
932 }
933 }
934 END_STATE()
935
936 BEGIN_STATE(AfterAttributeNameState) {
937 if (isTokenizerWhitespace(cc))
938 ADVANCE_TO(AfterAttributeNameState);
939 else if (cc == '/')
940 ADVANCE_TO(SelfClosingStartTagState);
941 else if (cc == '=')
942 ADVANCE_TO(BeforeAttributeValueState);
943 else if (cc == '>')
944 return emitAndResumeIn(source, DataState);
945 else if (m_usePreHTML5ParserQuirks && cc == '<')
946 return emitAndReconsumeIn(source, DataState);
947 else if (isASCIIUpper(cc)) {
948 m_token->addNewAttribute();
949 m_token->beginAttributeName(source.numberOfCharactersConsumed());
950 m_token->appendToAttributeName(toLowerCase(cc));
951 ADVANCE_TO(AttributeNameState);
952 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
953 parseError();
954 RECONSUME_IN(DataState);
955 } else {
956 if (cc == '"' || cc == '\'' || cc == '<')
957 parseError();
958 m_token->addNewAttribute();
959 m_token->beginAttributeName(source.numberOfCharactersConsumed());
960 m_token->appendToAttributeName(cc);
961 ADVANCE_TO(AttributeNameState);
962 }
963 }
964 END_STATE()
965
966 BEGIN_STATE(BeforeAttributeValueState) {
967 if (isTokenizerWhitespace(cc))
968 ADVANCE_TO(BeforeAttributeValueState);
969 else if (cc == '"') {
970 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
971 ADVANCE_TO(AttributeValueDoubleQuotedState);
972 } else if (cc == '&') {
973 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
974 RECONSUME_IN(AttributeValueUnquotedState);
975 } else if (cc == '\'') {
976 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
977 ADVANCE_TO(AttributeValueSingleQuotedState);
978 } else if (cc == '>') {
979 parseError();
980 return emitAndResumeIn(source, DataState);
981 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
982 parseError();
983 RECONSUME_IN(DataState);
984 } else {
985 if (cc == '<' || cc == '=' || cc == '`')
986 parseError();
987 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
988 m_token->appendToAttributeValue(cc);
989 ADVANCE_TO(AttributeValueUnquotedState);
990 }
991 }
992 END_STATE()
993
994 BEGIN_STATE(AttributeValueDoubleQuotedState) {
995 if (cc == '"') {
996 m_token->endAttributeValue(source.numberOfCharactersConsumed());
997 ADVANCE_TO(AfterAttributeValueQuotedState);
998 } else if (cc == '&') {
999 m_additionalAllowedCharacter = '"';
1000 ADVANCE_TO(CharacterReferenceInAttributeValueState);
1001 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1002 parseError();
1003 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1004 RECONSUME_IN(DataState);
1005 } else {
1006 m_token->appendToAttributeValue(cc);
1007 ADVANCE_TO(AttributeValueDoubleQuotedState);
1008 }
1009 }
1010 END_STATE()
1011
1012 BEGIN_STATE(AttributeValueSingleQuotedState) {
1013 if (cc == '\'') {
1014 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1015 ADVANCE_TO(AfterAttributeValueQuotedState);
1016 } else if (cc == '&') {
1017 m_additionalAllowedCharacter = '\'';
1018 ADVANCE_TO(CharacterReferenceInAttributeValueState);
1019 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1020 parseError();
1021 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1022 RECONSUME_IN(DataState);
1023 } else {
1024 m_token->appendToAttributeValue(cc);
1025 ADVANCE_TO(AttributeValueSingleQuotedState);
1026 }
1027 }
1028 END_STATE()
1029
1030 BEGIN_STATE(AttributeValueUnquotedState) {
1031 if (isTokenizerWhitespace(cc)) {
1032 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1033 ADVANCE_TO(BeforeAttributeNameState);
1034 } else if (cc == '&') {
1035 m_additionalAllowedCharacter = '>';
1036 ADVANCE_TO(CharacterReferenceInAttributeValueState);
1037 } else if (cc == '>') {
1038 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1039 return emitAndResumeIn(source, DataState);
1040 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1041 parseError();
1042 m_token->endAttributeValue(source.numberOfCharactersConsumed());
1043 RECONSUME_IN(DataState);
1044 } else {
1045 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
1046 parseError();
1047 m_token->appendToAttributeValue(cc);
1048 ADVANCE_TO(AttributeValueUnquotedState);
1049 }
1050 }
1051 END_STATE()
1052
1053 BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1054 bool notEnoughCharacters = false;
1055 Vector<UChar, 16> decodedEntity;
1056 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1057 if (notEnoughCharacters)
1058 return haveBufferedCharacterToken();
1059 if (!success) {
1060 ASSERT(decodedEntity.isEmpty());
1061 m_token->appendToAttributeValue('&');
1062 } else {
1063 Vector<UChar>::const_iterator iter = decodedEntity.begin();
1064 for (; iter != decodedEntity.end(); ++iter)
1065 m_token->appendToAttributeValue(*iter);
1066 }
1067 // We're supposed to switch back to the attribute value state that
1068 // we were in when we were switched into this state. Rather than
// keeping track of this explicitly, we observe that the previous
1070 // state can be determined by m_additionalAllowedCharacter.
1071 if (m_additionalAllowedCharacter == '"')
1072 SWITCH_TO(AttributeValueDoubleQuotedState);
1073 else if (m_additionalAllowedCharacter == '\'')
1074 SWITCH_TO(AttributeValueSingleQuotedState);
1075 else if (m_additionalAllowedCharacter == '>')
1076 SWITCH_TO(AttributeValueUnquotedState);
1077 else
1078 ASSERT_NOT_REACHED();
1079 }
1080 END_STATE()
1081
1082 BEGIN_STATE(AfterAttributeValueQuotedState) {
1083 if (isTokenizerWhitespace(cc))
1084 ADVANCE_TO(BeforeAttributeNameState);
1085 else if (cc == '/')
1086 ADVANCE_TO(SelfClosingStartTagState);
1087 else if (cc == '>')
1088 return emitAndResumeIn(source, DataState);
1089 else if (m_usePreHTML5ParserQuirks && cc == '<')
1090 return emitAndReconsumeIn(source, DataState);
1091 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1092 parseError();
1093 RECONSUME_IN(DataState);
1094 } else {
1095 parseError();
1096 RECONSUME_IN(BeforeAttributeNameState);
1097 }
1098 }
1099 END_STATE()
1100
1101 BEGIN_STATE(SelfClosingStartTagState) {
1102 if (cc == '>') {
1103 m_token->setSelfClosing();
1104 return emitAndResumeIn(source, DataState);
1105 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1106 parseError();
1107 RECONSUME_IN(DataState);
1108 } else {
1109 parseError();
1110 RECONSUME_IN(BeforeAttributeNameState);
1111 }
1112 }
1113 END_STATE()
1114
1115 BEGIN_STATE(BogusCommentState) {
1116 m_token->beginComment();
1117 RECONSUME_IN(ContinueBogusCommentState);
1118 }
1119 END_STATE()
1120
1121 BEGIN_STATE(ContinueBogusCommentState) {
1122 if (cc == '>')
1123 return emitAndResumeIn(source, DataState);
1124 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1125 return emitAndReconsumeIn(source, DataState);
1126 else {
1127 m_token->appendToComment(cc);
1128 ADVANCE_TO(ContinueBogusCommentState);
1129 }
1130 }
1131 END_STATE()
1132
1133 BEGIN_STATE(MarkupDeclarationOpenState) {
1134 DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1135 DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1136 DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
1137 if (cc == '-') {
1138 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1139 if (result == SegmentedString::DidMatch) {
1140 source.advanceAndASSERT('-');
1141 source.advanceAndASSERT('-');
1142 m_token->beginComment();
1143 SWITCH_TO(CommentStartState);
1144 } else if (result == SegmentedString::NotEnoughCharacters)
1145 return haveBufferedCharacterToken();
1146 } else if (cc == 'D' || cc == 'd') {
1147 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1148 if (result == SegmentedString::DidMatch) {
1149 advanceStringAndASSERTIgnoringCase(source, "doctype");
1150 SWITCH_TO(DOCTYPEState);
1151 } else if (result == SegmentedString::NotEnoughCharacters)
1152 return haveBufferedCharacterToken();
1153 } else if (cc == '[' && shouldAllowCDATA()) {
1154 SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1155 if (result == SegmentedString::DidMatch) {
1156 advanceStringAndASSERT(source, "[CDATA[");
1157 SWITCH_TO(CDATASectionState);
1158 } else if (result == SegmentedString::NotEnoughCharacters)
1159 return haveBufferedCharacterToken();
1160 }
1161 parseError();
1162 RECONSUME_IN(BogusCommentState);
1163 }
1164 END_STATE()
1165
1166 BEGIN_STATE(CommentStartState) {
1167 if (cc == '-')
1168 ADVANCE_TO(CommentStartDashState);
1169 else if (cc == '>') {
1170 parseError();
1171 return emitAndResumeIn(source, DataState);
1172 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1173 parseError();
1174 return emitAndReconsumeIn(source, DataState);
1175 } else {
1176 m_token->appendToComment(cc);
1177 ADVANCE_TO(CommentState);
1178 }
1179 }
1180 END_STATE()
1181
1182 BEGIN_STATE(CommentStartDashState) {
1183 if (cc == '-')
1184 ADVANCE_TO(CommentEndState);
1185 else if (cc == '>') {
1186 parseError();
1187 return emitAndResumeIn(source, DataState);
1188 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1189 parseError();
1190 return emitAndReconsumeIn(source, DataState);
1191 } else {
1192 m_token->appendToComment('-');
1193 m_token->appendToComment(cc);
1194 ADVANCE_TO(CommentState);
1195 }
1196 }
1197 END_STATE()
1198
1199 BEGIN_STATE(CommentState) {
1200 if (cc == '-')
1201 ADVANCE_TO(CommentEndDashState);
1202 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1203 parseError();
1204 return emitAndReconsumeIn(source, DataState);
1205 } else {
1206 m_token->appendToComment(cc);
1207 ADVANCE_TO(CommentState);
1208 }
1209 }
1210 END_STATE()
1211
1212 BEGIN_STATE(CommentEndDashState) {
1213 if (cc == '-')
1214 ADVANCE_TO(CommentEndState);
1215 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1216 parseError();
1217 return emitAndReconsumeIn(source, DataState);
1218 } else {
1219 m_token->appendToComment('-');
1220 m_token->appendToComment(cc);
1221 ADVANCE_TO(CommentState);
1222 }
1223 }
1224 END_STATE()
1225
1226 BEGIN_STATE(CommentEndState) {
1227 if (cc == '>')
1228 return emitAndResumeIn(source, DataState);
1229 else if (cc == '!') {
1230 parseError();
1231 ADVANCE_TO(CommentEndBangState);
1232 } else if (cc == '-') {
1233 parseError();
1234 m_token->appendToComment('-');
1235 ADVANCE_TO(CommentEndState);
1236 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1237 parseError();
1238 return emitAndReconsumeIn(source, DataState);
1239 } else {
1240 parseError();
1241 m_token->appendToComment('-');
1242 m_token->appendToComment('-');
1243 m_token->appendToComment(cc);
1244 ADVANCE_TO(CommentState);
1245 }
1246 }
1247 END_STATE()
1248
1249 BEGIN_STATE(CommentEndBangState) {
1250 if (cc == '-') {
1251 m_token->appendToComment('-');
1252 m_token->appendToComment('-');
1253 m_token->appendToComment('!');
1254 ADVANCE_TO(CommentEndDashState);
1255 } else if (cc == '>')
1256 return emitAndResumeIn(source, DataState);
1257 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1258 parseError();
1259 return emitAndReconsumeIn(source, DataState);
1260 } else {
1261 m_token->appendToComment('-');
1262 m_token->appendToComment('-');
1263 m_token->appendToComment('!');
1264 m_token->appendToComment(cc);
1265 ADVANCE_TO(CommentState);
1266 }
1267 }
1268 END_STATE()
1269
1270 BEGIN_STATE(DOCTYPEState) {
1271 if (isTokenizerWhitespace(cc))
1272 ADVANCE_TO(BeforeDOCTYPENameState);
1273 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1274 parseError();
1275 m_token->beginDOCTYPE();
1276 m_token->setForceQuirks();
1277 return emitAndReconsumeIn(source, DataState);
1278 } else {
1279 parseError();
1280 RECONSUME_IN(BeforeDOCTYPENameState);
1281 }
1282 }
1283 END_STATE()
1284
1285 BEGIN_STATE(BeforeDOCTYPENameState) {
1286 if (isTokenizerWhitespace(cc))
1287 ADVANCE_TO(BeforeDOCTYPENameState);
1288 else if (isASCIIUpper(cc)) {
1289 m_token->beginDOCTYPE(toLowerCase(cc));
1290 ADVANCE_TO(DOCTYPENameState);
1291 } else if (cc == '>') {
1292 parseError();
1293 m_token->beginDOCTYPE();
1294 m_token->setForceQuirks();
1295 return emitAndResumeIn(source, DataState);
1296 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1297 parseError();
1298 m_token->beginDOCTYPE();
1299 m_token->setForceQuirks();
1300 return emitAndReconsumeIn(source, DataState);
1301 } else {
1302 m_token->beginDOCTYPE(cc);
1303 ADVANCE_TO(DOCTYPENameState);
1304 }
1305 }
1306 END_STATE()
1307
1308 BEGIN_STATE(DOCTYPENameState) {
1309 if (isTokenizerWhitespace(cc))
1310 ADVANCE_TO(AfterDOCTYPENameState);
1311 else if (cc == '>')
1312 return emitAndResumeIn(source, DataState);
1313 else if (isASCIIUpper(cc)) {
1314 m_token->appendToName(toLowerCase(cc));
1315 ADVANCE_TO(DOCTYPENameState);
1316 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1317 parseError();
1318 m_token->setForceQuirks();
1319 return emitAndReconsumeIn(source, DataState);
1320 } else {
1321 m_token->appendToName(cc);
1322 ADVANCE_TO(DOCTYPENameState);
1323 }
1324 }
1325 END_STATE()
1326
1327 BEGIN_STATE(AfterDOCTYPENameState) {
1328 if (isTokenizerWhitespace(cc))
1329 ADVANCE_TO(AfterDOCTYPENameState);
1330 if (cc == '>')
1331 return emitAndResumeIn(source, DataState);
1332 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1333 parseError();
1334 m_token->setForceQuirks();
1335 return emitAndReconsumeIn(source, DataState);
1336 } else {
1337 DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1338 DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1339 if (cc == 'P' || cc == 'p') {
1340 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1341 if (result == SegmentedString::DidMatch) {
1342 advanceStringAndASSERTIgnoringCase(source, "public");
1343 SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1344 } else if (result == SegmentedString::NotEnoughCharacters)
1345 return haveBufferedCharacterToken();
1346 } else if (cc == 'S' || cc == 's') {
1347 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1348 if (result == SegmentedString::DidMatch) {
1349 advanceStringAndASSERTIgnoringCase(source, "system");
1350 SWITCH_TO(AfterDOCTYPESystemKeywordState);
1351 } else if (result == SegmentedString::NotEnoughCharacters)
1352 return haveBufferedCharacterToken();
1353 }
1354 parseError();
1355 m_token->setForceQuirks();
1356 ADVANCE_TO(BogusDOCTYPEState);
1357 }
1358 }
1359 END_STATE()
1360
1361 BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1362 if (isTokenizerWhitespace(cc))
1363 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1364 else if (cc == '"') {
1365 parseError();
1366 m_token->setPublicIdentifierToEmptyString();
1367 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1368 } else if (cc == '\'') {
1369 parseError();
1370 m_token->setPublicIdentifierToEmptyString();
1371 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1372 } else if (cc == '>') {
1373 parseError();
1374 m_token->setForceQuirks();
1375 return emitAndResumeIn(source, DataState);
1376 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1377 parseError();
1378 m_token->setForceQuirks();
1379 return emitAndReconsumeIn(source, DataState);
1380 } else {
1381 parseError();
1382 m_token->setForceQuirks();
1383 ADVANCE_TO(BogusDOCTYPEState);
1384 }
1385 }
1386 END_STATE()
1387
1388 BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1389 if (isTokenizerWhitespace(cc))
1390 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1391 else if (cc == '"') {
1392 m_token->setPublicIdentifierToEmptyString();
1393 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1394 } else if (cc == '\'') {
1395 m_token->setPublicIdentifierToEmptyString();
1396 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1397 } else if (cc == '>') {
1398 parseError();
1399 m_token->setForceQuirks();
1400 return emitAndResumeIn(source, DataState);
1401 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1402 parseError();
1403 m_token->setForceQuirks();
1404 return emitAndReconsumeIn(source, DataState);
1405 } else {
1406 parseError();
1407 m_token->setForceQuirks();
1408 ADVANCE_TO(BogusDOCTYPEState);
1409 }
1410 }
1411 END_STATE()
1412
1413 BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1414 if (cc == '"')
1415 ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1416 else if (cc == '>') {
1417 parseError();
1418 m_token->setForceQuirks();
1419 return emitAndResumeIn(source, DataState);
1420 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1421 parseError();
1422 m_token->setForceQuirks();
1423 return emitAndReconsumeIn(source, DataState);
1424 } else {
1425 m_token->appendToPublicIdentifier(cc);
1426 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1427 }
1428 }
1429 END_STATE()
1430
1431 BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1432 if (cc == '\'')
1433 ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1434 else if (cc == '>') {
1435 parseError();
1436 m_token->setForceQuirks();
1437 return emitAndResumeIn(source, DataState);
1438 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1439 parseError();
1440 m_token->setForceQuirks();
1441 return emitAndReconsumeIn(source, DataState);
1442 } else {
1443 m_token->appendToPublicIdentifier(cc);
1444 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1445 }
1446 }
1447 END_STATE()
1448
1449 BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1450 if (isTokenizerWhitespace(cc))
1451 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1452 else if (cc == '>')
1453 return emitAndResumeIn(source, DataState);
1454 else if (cc == '"') {
1455 parseError();
1456 m_token->setSystemIdentifierToEmptyString();
1457 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1458 } else if (cc == '\'') {
1459 parseError();
1460 m_token->setSystemIdentifierToEmptyString();
1461 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1462 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1463 parseError();
1464 m_token->setForceQuirks();
1465 return emitAndReconsumeIn(source, DataState);
1466 } else {
1467 parseError();
1468 m_token->setForceQuirks();
1469 ADVANCE_TO(BogusDOCTYPEState);
1470 }
1471 }
1472 END_STATE()
1473
1474 BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1475 if (isTokenizerWhitespace(cc))
1476 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1477 else if (cc == '>')
1478 return emitAndResumeIn(source, DataState);
1479 else if (cc == '"') {
1480 m_token->setSystemIdentifierToEmptyString();
1481 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1482 } else if (cc == '\'') {
1483 m_token->setSystemIdentifierToEmptyString();
1484 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1485 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1486 parseError();
1487 m_token->setForceQuirks();
1488 return emitAndReconsumeIn(source, DataState);
1489 } else {
1490 parseError();
1491 m_token->setForceQuirks();
1492 ADVANCE_TO(BogusDOCTYPEState);
1493 }
1494 }
1495 END_STATE()
1496
1497 BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1498 if (isTokenizerWhitespace(cc))
1499 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1500 else if (cc == '"') {
1501 parseError();
1502 m_token->setSystemIdentifierToEmptyString();
1503 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1504 } else if (cc == '\'') {
1505 parseError();
1506 m_token->setSystemIdentifierToEmptyString();
1507 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1508 } else if (cc == '>') {
1509 parseError();
1510 m_token->setForceQuirks();
1511 return emitAndResumeIn(source, DataState);
1512 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1513 parseError();
1514 m_token->setForceQuirks();
1515 return emitAndReconsumeIn(source, DataState);
1516 } else {
1517 parseError();
1518 m_token->setForceQuirks();
1519 ADVANCE_TO(BogusDOCTYPEState);
1520 }
1521 }
1522 END_STATE()
1523
1524 BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1525 if (isTokenizerWhitespace(cc))
1526 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1527 if (cc == '"') {
1528 m_token->setSystemIdentifierToEmptyString();
1529 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1530 } else if (cc == '\'') {
1531 m_token->setSystemIdentifierToEmptyString();
1532 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1533 } else if (cc == '>') {
1534 parseError();
1535 m_token->setForceQuirks();
1536 return emitAndResumeIn(source, DataState);
1537 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1538 parseError();
1539 m_token->setForceQuirks();
1540 return emitAndReconsumeIn(source, DataState);
1541 } else {
1542 parseError();
1543 m_token->setForceQuirks();
1544 ADVANCE_TO(BogusDOCTYPEState);
1545 }
1546 }
1547 END_STATE()
1548
1549 BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1550 if (cc == '"')
1551 ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1552 else if (cc == '>') {
1553 parseError();
1554 m_token->setForceQuirks();
1555 return emitAndResumeIn(source, DataState);
1556 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1557 parseError();
1558 m_token->setForceQuirks();
1559 return emitAndReconsumeIn(source, DataState);
1560 } else {
1561 m_token->appendToSystemIdentifier(cc);
1562 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1563 }
1564 }
1565 END_STATE()
1566
1567 BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1568 if (cc == '\'')
1569 ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1570 else if (cc == '>') {
1571 parseError();
1572 m_token->setForceQuirks();
1573 return emitAndResumeIn(source, DataState);
1574 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1575 parseError();
1576 m_token->setForceQuirks();
1577 return emitAndReconsumeIn(source, DataState);
1578 } else {
1579 m_token->appendToSystemIdentifier(cc);
1580 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1581 }
1582 }
1583 END_STATE()
1584
1585 BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1586 if (isTokenizerWhitespace(cc))
1587 ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1588 else if (cc == '>')
1589 return emitAndResumeIn(source, DataState);
1590 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1591 parseError();
1592 m_token->setForceQuirks();
1593 return emitAndReconsumeIn(source, DataState);
1594 } else {
1595 parseError();
1596 ADVANCE_TO(BogusDOCTYPEState);
1597 }
1598 }
1599 END_STATE()
1600
1601 BEGIN_STATE(BogusDOCTYPEState) {
1602 if (cc == '>')
1603 return emitAndResumeIn(source, DataState);
1604 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1605 return emitAndReconsumeIn(source, DataState);
1606 ADVANCE_TO(BogusDOCTYPEState);
1607 }
1608 END_STATE()
1609
1610 BEGIN_STATE(CDATASectionState) {
1611 if (cc == ']')
1612 ADVANCE_TO(CDATASectionRightSquareBracketState);
1613 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1614 RECONSUME_IN(DataState);
1615 else {
1616 bufferCharacter(cc);
1617 ADVANCE_TO(CDATASectionState);
1618 }
1619 }
1620 END_STATE()
1621
1622 BEGIN_STATE(CDATASectionRightSquareBracketState) {
1623 if (cc == ']')
1624 ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1625 else {
1626 bufferCharacter(']');
1627 RECONSUME_IN(CDATASectionState);
1628 }
1629 }
1630
1631 BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1632 if (cc == '>')
1633 ADVANCE_TO(DataState);
1634 else {
1635 bufferCharacter(']');
1636 bufferCharacter(']');
1637 RECONSUME_IN(CDATASectionState);
1638 }
1639 }
1640 END_STATE()
1641
1642 }
1643
1644 ASSERT_NOT_REACHED();
1645 return false;
1646 }
1647
updateStateFor(const AtomicString & tagName,Frame * frame)1648 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
1649 {
1650 if (tagName == textareaTag || tagName == titleTag)
1651 setState(RCDATAState);
1652 else if (tagName == plaintextTag)
1653 setState(PLAINTEXTState);
1654 else if (tagName == scriptTag)
1655 setState(ScriptDataState);
1656 else if (tagName == styleTag
1657 || tagName == iframeTag
1658 || tagName == xmpTag
1659 || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
1660 || tagName == noframesTag
1661 || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
1662 setState(RAWTEXTState);
1663 }
1664
temporaryBufferIs(const String & expectedString)1665 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1666 {
1667 return vectorEqualsString(m_temporaryBuffer, expectedString);
1668 }
1669
addToPossibleEndTag(UChar cc)1670 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
1671 {
1672 ASSERT(isEndTagBufferingState(m_state));
1673 m_bufferedEndTagName.append(cc);
1674 }
1675
isAppropriateEndTag()1676 inline bool HTMLTokenizer::isAppropriateEndTag()
1677 {
1678 return m_bufferedEndTagName == m_appropriateEndTagName;
1679 }
1680
bufferCharacter(UChar character)1681 inline void HTMLTokenizer::bufferCharacter(UChar character)
1682 {
1683 ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
1684 m_token->ensureIsCharacterToken();
1685 m_token->appendToCharacter(character);
1686 }
1687
parseError()1688 inline void HTMLTokenizer::parseError()
1689 {
1690 notImplemented();
1691 }
1692
haveBufferedCharacterToken()1693 inline bool HTMLTokenizer::haveBufferedCharacterToken()
1694 {
1695 return m_token->type() == HTMLToken::Character;
1696 }
1697
1698 }
1699