1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29 #include "core/html/parser/HTMLTokenizer.h"
30
31 #include "core/HTMLNames.h"
32 #include "core/HTMLTokenizerNames.h"
33 #include "core/html/parser/HTMLEntityParser.h"
34 #include "core/html/parser/HTMLParserIdioms.h"
35 #include "core/html/parser/HTMLTreeBuilder.h"
36 #include "platform/NotImplemented.h"
37 #include "core/xml/parser/MarkupTokenizerInlines.h"
38 #include "wtf/ASCIICType.h"
39 #include "wtf/text/AtomicString.h"
40 #include "wtf/unicode/Unicode.h"
41
42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
44 #undef DEFINE_STATIC_LOCAL
45
46 namespace WebCore {
47
48 using namespace HTMLNames;
49
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
nameForAttribute(const HTMLToken::Attribute & attribute) const52 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
53 {
54 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
55 }
56
usesName() const57 bool AtomicHTMLToken::usesName() const
58 {
59 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
60 }
61
usesAttributes() const62 bool AtomicHTMLToken::usesAttributes() const
63 {
64 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
65 }
66
toLowerCase(UChar cc)67 static inline UChar toLowerCase(UChar cc)
68 {
69 ASSERT(isASCIIUpper(cc));
70 const int lowerCaseOffset = 0x20;
71 return cc + lowerCaseOffset;
72 }
73
vectorEqualsString(const Vector<LChar,32> & vector,const String & string)74 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
75 {
76 if (vector.size() != string.length())
77 return false;
78
79 if (!string.length())
80 return true;
81
82 return equal(string.impl(), vector.data(), vector.size());
83 }
84
isEndTagBufferingState(HTMLTokenizer::State state)85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
86 {
87 switch (state) {
88 case HTMLTokenizer::RCDATAEndTagOpenState:
89 case HTMLTokenizer::RCDATAEndTagNameState:
90 case HTMLTokenizer::RAWTEXTEndTagOpenState:
91 case HTMLTokenizer::RAWTEXTEndTagNameState:
92 case HTMLTokenizer::ScriptDataEndTagOpenState:
93 case HTMLTokenizer::ScriptDataEndTagNameState:
94 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
95 case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
96 return true;
97 default:
98 return false;
99 }
100 }
101
// Shorthands for the generic state-machine macros (BEGIN_STATE, RECONSUME_IN,
// ADVANCE_TO, SWITCH_TO) with HTMLTokenizer supplied as the first argument, so
// the states in this file only need to spell the state name.
// NOTE(review): the generic macros are not defined in this file; presumably
// they come from MarkupTokenizerInlines.h -- confirm.
102 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
103 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
104 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName)
105 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName)
106
HTMLTokenizer(const HTMLParserOptions & options)107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
108 : m_inputStreamPreprocessor(this)
109 , m_options(options)
110 {
111 reset();
112 }
113
~HTMLTokenizer()114 HTMLTokenizer::~HTMLTokenizer()
115 {
116 }
117
reset()118 void HTMLTokenizer::reset()
119 {
120 m_state = HTMLTokenizer::DataState;
121 m_token = 0;
122 m_forceNullCharacterReplacement = false;
123 m_shouldAllowCDATA = false;
124 m_additionalAllowedCharacter = '\0';
125 }
126
canCreateCheckpoint() const127 bool HTMLTokenizer::canCreateCheckpoint() const
128 {
129 if (!m_appropriateEndTagName.isEmpty())
130 return false;
131 if (!m_temporaryBuffer.isEmpty())
132 return false;
133 if (!m_bufferedEndTagName.isEmpty())
134 return false;
135 return true;
136 }
137
createCheckpoint(Checkpoint & result) const138 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const
139 {
140 ASSERT(canCreateCheckpoint());
141 result.options = m_options;
142 result.state = m_state;
143 result.additionalAllowedCharacter = m_additionalAllowedCharacter;
144 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine();
145 result.shouldAllowCDATA = m_shouldAllowCDATA;
146 }
147
restoreFromCheckpoint(const Checkpoint & checkpoint)148 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint)
149 {
150 m_token = 0;
151 m_options = checkpoint.options;
152 m_state = checkpoint.state;
153 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter;
154 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine);
155 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA;
156 }
157
processEntity(SegmentedString & source)158 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
159 {
160 bool notEnoughCharacters = false;
161 DecodedHTMLEntity decodedEntity;
162 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
163 if (notEnoughCharacters)
164 return false;
165 if (!success) {
166 ASSERT(decodedEntity.isEmpty());
167 bufferCharacter('&');
168 } else {
169 for (unsigned i = 0; i < decodedEntity.length; ++i)
170 bufferCharacter(decodedEntity.data[i]);
171 }
172 return true;
173 }
174
flushBufferedEndTag(SegmentedString & source)175 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
176 {
177 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
178 source.advanceAndUpdateLineNumber();
179 if (m_token->type() == HTMLToken::Character)
180 return true;
181 m_token->beginEndTag(m_bufferedEndTagName);
182 m_bufferedEndTagName.clear();
183 m_appropriateEndTagName.clear();
184 m_temporaryBuffer.clear();
185 return false;
186 }
187
// Flushes the buffered end tag and continues tokenizing in |stateName|.
// The expansion relies on names from the surrounding scope: a SegmentedString
// called `source`, a character variable `cc`, and a label named |stateName|.
// Steps, in order:
//   1. Set m_state to HTMLTokenizer::stateName.
//   2. Call flushBufferedEndTag(source); if it returns true, return true from
//      the enclosing function.
//   3. If the source is empty or the input stream preprocessor's peek()
//      fails, return haveBufferedCharacterToken().
//   4. Otherwise load the next input character into `cc` and jump to the
//      |stateName| label.
188 #define FLUSH_AND_ADVANCE_TO(stateName) \
189 do { \
190 m_state = HTMLTokenizer::stateName; \
191 if (flushBufferedEndTag(source)) \
192 return true; \
193 if (source.isEmpty() \
194 || !m_inputStreamPreprocessor.peek(source)) \
195 return haveBufferedCharacterToken(); \
196 cc = m_inputStreamPreprocessor.nextInputCharacter(); \
197 goto stateName; \
198 } while (false)
199
flushEmitAndResumeIn(SegmentedString & source,HTMLTokenizer::State state)200 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
201 {
202 m_state = state;
203 flushBufferedEndTag(source);
204 return true;
205 }
206
nextToken(SegmentedString & source,HTMLToken & token)207 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
208 {
209 // If we have a token in progress, then we're supposed to be called back
210 // with the same token so we can finish it.
211 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
212 m_token = &token;
213
214 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
215 // FIXME: This should call flushBufferedEndTag().
216 // We started an end tag during our last iteration.
217 m_token->beginEndTag(m_bufferedEndTagName);
218 m_bufferedEndTagName.clear();
219 m_appropriateEndTagName.clear();
220 m_temporaryBuffer.clear();
221 if (m_state == HTMLTokenizer::DataState) {
222 // We're back in the data state, so we must be done with the tag.
223 return true;
224 }
225 }
226
227 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
228 return haveBufferedCharacterToken();
229 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
230
231 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
232 switch (m_state) {
233 HTML_BEGIN_STATE(DataState) {
234 if (cc == '&')
235 HTML_ADVANCE_TO(CharacterReferenceInDataState);
236 else if (cc == '<') {
237 if (m_token->type() == HTMLToken::Character) {
238 // We have a bunch of character tokens queued up that we
239 // are emitting lazily here.
240 return true;
241 }
242 HTML_ADVANCE_TO(TagOpenState);
243 } else if (cc == kEndOfFileMarker)
244 return emitEndOfFile(source);
245 else {
246 bufferCharacter(cc);
247 HTML_ADVANCE_TO(DataState);
248 }
249 }
250 END_STATE()
251
252 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
253 if (!processEntity(source))
254 return haveBufferedCharacterToken();
255 HTML_SWITCH_TO(DataState);
256 }
257 END_STATE()
258
259 HTML_BEGIN_STATE(RCDATAState) {
260 if (cc == '&')
261 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
262 else if (cc == '<')
263 HTML_ADVANCE_TO(RCDATALessThanSignState);
264 else if (cc == kEndOfFileMarker)
265 return emitEndOfFile(source);
266 else {
267 bufferCharacter(cc);
268 HTML_ADVANCE_TO(RCDATAState);
269 }
270 }
271 END_STATE()
272
273 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
274 if (!processEntity(source))
275 return haveBufferedCharacterToken();
276 HTML_SWITCH_TO(RCDATAState);
277 }
278 END_STATE()
279
280 HTML_BEGIN_STATE(RAWTEXTState) {
281 if (cc == '<')
282 HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
283 else if (cc == kEndOfFileMarker)
284 return emitEndOfFile(source);
285 else {
286 bufferCharacter(cc);
287 HTML_ADVANCE_TO(RAWTEXTState);
288 }
289 }
290 END_STATE()
291
292 HTML_BEGIN_STATE(ScriptDataState) {
293 if (cc == '<')
294 HTML_ADVANCE_TO(ScriptDataLessThanSignState);
295 else if (cc == kEndOfFileMarker)
296 return emitEndOfFile(source);
297 else {
298 bufferCharacter(cc);
299 HTML_ADVANCE_TO(ScriptDataState);
300 }
301 }
302 END_STATE()
303
304 HTML_BEGIN_STATE(PLAINTEXTState) {
305 if (cc == kEndOfFileMarker)
306 return emitEndOfFile(source);
307 bufferCharacter(cc);
308 HTML_ADVANCE_TO(PLAINTEXTState);
309 }
310 END_STATE()
311
312 HTML_BEGIN_STATE(TagOpenState) {
313 if (cc == '!')
314 HTML_ADVANCE_TO(MarkupDeclarationOpenState);
315 else if (cc == '/')
316 HTML_ADVANCE_TO(EndTagOpenState);
317 else if (isASCIIUpper(cc)) {
318 m_token->beginStartTag(toLowerCase(cc));
319 HTML_ADVANCE_TO(TagNameState);
320 } else if (isASCIILower(cc)) {
321 m_token->beginStartTag(cc);
322 HTML_ADVANCE_TO(TagNameState);
323 } else if (cc == '?') {
324 parseError();
325 // The spec consumes the current character before switching
326 // to the bogus comment state, but it's easier to implement
327 // if we reconsume the current character.
328 HTML_RECONSUME_IN(BogusCommentState);
329 } else {
330 parseError();
331 bufferCharacter('<');
332 HTML_RECONSUME_IN(DataState);
333 }
334 }
335 END_STATE()
336
337 HTML_BEGIN_STATE(EndTagOpenState) {
338 if (isASCIIUpper(cc)) {
339 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
340 m_appropriateEndTagName.clear();
341 HTML_ADVANCE_TO(TagNameState);
342 } else if (isASCIILower(cc)) {
343 m_token->beginEndTag(static_cast<LChar>(cc));
344 m_appropriateEndTagName.clear();
345 HTML_ADVANCE_TO(TagNameState);
346 } else if (cc == '>') {
347 parseError();
348 HTML_ADVANCE_TO(DataState);
349 } else if (cc == kEndOfFileMarker) {
350 parseError();
351 bufferCharacter('<');
352 bufferCharacter('/');
353 HTML_RECONSUME_IN(DataState);
354 } else {
355 parseError();
356 HTML_RECONSUME_IN(BogusCommentState);
357 }
358 }
359 END_STATE()
360
361 HTML_BEGIN_STATE(TagNameState) {
362 if (isTokenizerWhitespace(cc))
363 HTML_ADVANCE_TO(BeforeAttributeNameState);
364 else if (cc == '/')
365 HTML_ADVANCE_TO(SelfClosingStartTagState);
366 else if (cc == '>')
367 return emitAndResumeIn(source, HTMLTokenizer::DataState);
368 else if (isASCIIUpper(cc)) {
369 m_token->appendToName(toLowerCase(cc));
370 HTML_ADVANCE_TO(TagNameState);
371 } else if (cc == kEndOfFileMarker) {
372 parseError();
373 HTML_RECONSUME_IN(DataState);
374 } else {
375 m_token->appendToName(cc);
376 HTML_ADVANCE_TO(TagNameState);
377 }
378 }
379 END_STATE()
380
381 HTML_BEGIN_STATE(RCDATALessThanSignState) {
382 if (cc == '/') {
383 m_temporaryBuffer.clear();
384 ASSERT(m_bufferedEndTagName.isEmpty());
385 HTML_ADVANCE_TO(RCDATAEndTagOpenState);
386 } else {
387 bufferCharacter('<');
388 HTML_RECONSUME_IN(RCDATAState);
389 }
390 }
391 END_STATE()
392
393 HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
394 if (isASCIIUpper(cc)) {
395 m_temporaryBuffer.append(static_cast<LChar>(cc));
396 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
397 HTML_ADVANCE_TO(RCDATAEndTagNameState);
398 } else if (isASCIILower(cc)) {
399 m_temporaryBuffer.append(static_cast<LChar>(cc));
400 addToPossibleEndTag(static_cast<LChar>(cc));
401 HTML_ADVANCE_TO(RCDATAEndTagNameState);
402 } else {
403 bufferCharacter('<');
404 bufferCharacter('/');
405 HTML_RECONSUME_IN(RCDATAState);
406 }
407 }
408 END_STATE()
409
410 HTML_BEGIN_STATE(RCDATAEndTagNameState) {
411 if (isASCIIUpper(cc)) {
412 m_temporaryBuffer.append(static_cast<LChar>(cc));
413 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
414 HTML_ADVANCE_TO(RCDATAEndTagNameState);
415 } else if (isASCIILower(cc)) {
416 m_temporaryBuffer.append(static_cast<LChar>(cc));
417 addToPossibleEndTag(static_cast<LChar>(cc));
418 HTML_ADVANCE_TO(RCDATAEndTagNameState);
419 } else {
420 if (isTokenizerWhitespace(cc)) {
421 if (isAppropriateEndTag()) {
422 m_temporaryBuffer.append(static_cast<LChar>(cc));
423 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
424 }
425 } else if (cc == '/') {
426 if (isAppropriateEndTag()) {
427 m_temporaryBuffer.append(static_cast<LChar>(cc));
428 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
429 }
430 } else if (cc == '>') {
431 if (isAppropriateEndTag()) {
432 m_temporaryBuffer.append(static_cast<LChar>(cc));
433 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
434 }
435 }
436 bufferCharacter('<');
437 bufferCharacter('/');
438 m_token->appendToCharacter(m_temporaryBuffer);
439 m_bufferedEndTagName.clear();
440 m_temporaryBuffer.clear();
441 HTML_RECONSUME_IN(RCDATAState);
442 }
443 }
444 END_STATE()
445
446 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
447 if (cc == '/') {
448 m_temporaryBuffer.clear();
449 ASSERT(m_bufferedEndTagName.isEmpty());
450 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
451 } else {
452 bufferCharacter('<');
453 HTML_RECONSUME_IN(RAWTEXTState);
454 }
455 }
456 END_STATE()
457
458 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
459 if (isASCIIUpper(cc)) {
460 m_temporaryBuffer.append(static_cast<LChar>(cc));
461 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
462 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
463 } else if (isASCIILower(cc)) {
464 m_temporaryBuffer.append(static_cast<LChar>(cc));
465 addToPossibleEndTag(static_cast<LChar>(cc));
466 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
467 } else {
468 bufferCharacter('<');
469 bufferCharacter('/');
470 HTML_RECONSUME_IN(RAWTEXTState);
471 }
472 }
473 END_STATE()
474
475 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
476 if (isASCIIUpper(cc)) {
477 m_temporaryBuffer.append(static_cast<LChar>(cc));
478 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
479 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
480 } else if (isASCIILower(cc)) {
481 m_temporaryBuffer.append(static_cast<LChar>(cc));
482 addToPossibleEndTag(static_cast<LChar>(cc));
483 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
484 } else {
485 if (isTokenizerWhitespace(cc)) {
486 if (isAppropriateEndTag()) {
487 m_temporaryBuffer.append(static_cast<LChar>(cc));
488 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
489 }
490 } else if (cc == '/') {
491 if (isAppropriateEndTag()) {
492 m_temporaryBuffer.append(static_cast<LChar>(cc));
493 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
494 }
495 } else if (cc == '>') {
496 if (isAppropriateEndTag()) {
497 m_temporaryBuffer.append(static_cast<LChar>(cc));
498 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
499 }
500 }
501 bufferCharacter('<');
502 bufferCharacter('/');
503 m_token->appendToCharacter(m_temporaryBuffer);
504 m_bufferedEndTagName.clear();
505 m_temporaryBuffer.clear();
506 HTML_RECONSUME_IN(RAWTEXTState);
507 }
508 }
509 END_STATE()
510
511 HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
512 if (cc == '/') {
513 m_temporaryBuffer.clear();
514 ASSERT(m_bufferedEndTagName.isEmpty());
515 HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
516 } else if (cc == '!') {
517 bufferCharacter('<');
518 bufferCharacter('!');
519 HTML_ADVANCE_TO(ScriptDataEscapeStartState);
520 } else {
521 bufferCharacter('<');
522 HTML_RECONSUME_IN(ScriptDataState);
523 }
524 }
525 END_STATE()
526
527 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
528 if (isASCIIUpper(cc)) {
529 m_temporaryBuffer.append(static_cast<LChar>(cc));
530 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
531 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
532 } else if (isASCIILower(cc)) {
533 m_temporaryBuffer.append(static_cast<LChar>(cc));
534 addToPossibleEndTag(static_cast<LChar>(cc));
535 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
536 } else {
537 bufferCharacter('<');
538 bufferCharacter('/');
539 HTML_RECONSUME_IN(ScriptDataState);
540 }
541 }
542 END_STATE()
543
544 HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
545 if (isASCIIUpper(cc)) {
546 m_temporaryBuffer.append(static_cast<LChar>(cc));
547 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
548 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
549 } else if (isASCIILower(cc)) {
550 m_temporaryBuffer.append(static_cast<LChar>(cc));
551 addToPossibleEndTag(static_cast<LChar>(cc));
552 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
553 } else {
554 if (isTokenizerWhitespace(cc)) {
555 if (isAppropriateEndTag()) {
556 m_temporaryBuffer.append(static_cast<LChar>(cc));
557 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
558 }
559 } else if (cc == '/') {
560 if (isAppropriateEndTag()) {
561 m_temporaryBuffer.append(static_cast<LChar>(cc));
562 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
563 }
564 } else if (cc == '>') {
565 if (isAppropriateEndTag()) {
566 m_temporaryBuffer.append(static_cast<LChar>(cc));
567 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
568 }
569 }
570 bufferCharacter('<');
571 bufferCharacter('/');
572 m_token->appendToCharacter(m_temporaryBuffer);
573 m_bufferedEndTagName.clear();
574 m_temporaryBuffer.clear();
575 HTML_RECONSUME_IN(ScriptDataState);
576 }
577 }
578 END_STATE()
579
580 HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
581 if (cc == '-') {
582 bufferCharacter(cc);
583 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
584 } else
585 HTML_RECONSUME_IN(ScriptDataState);
586 }
587 END_STATE()
588
589 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
590 if (cc == '-') {
591 bufferCharacter(cc);
592 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
593 } else
594 HTML_RECONSUME_IN(ScriptDataState);
595 }
596 END_STATE()
597
598 HTML_BEGIN_STATE(ScriptDataEscapedState) {
599 if (cc == '-') {
600 bufferCharacter(cc);
601 HTML_ADVANCE_TO(ScriptDataEscapedDashState);
602 } else if (cc == '<')
603 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
604 else if (cc == kEndOfFileMarker) {
605 parseError();
606 HTML_RECONSUME_IN(DataState);
607 } else {
608 bufferCharacter(cc);
609 HTML_ADVANCE_TO(ScriptDataEscapedState);
610 }
611 }
612 END_STATE()
613
614 HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
615 if (cc == '-') {
616 bufferCharacter(cc);
617 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
618 } else if (cc == '<')
619 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
620 else if (cc == kEndOfFileMarker) {
621 parseError();
622 HTML_RECONSUME_IN(DataState);
623 } else {
624 bufferCharacter(cc);
625 HTML_ADVANCE_TO(ScriptDataEscapedState);
626 }
627 }
628 END_STATE()
629
630 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
631 if (cc == '-') {
632 bufferCharacter(cc);
633 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
634 } else if (cc == '<')
635 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
636 else if (cc == '>') {
637 bufferCharacter(cc);
638 HTML_ADVANCE_TO(ScriptDataState);
639 } else if (cc == kEndOfFileMarker) {
640 parseError();
641 HTML_RECONSUME_IN(DataState);
642 } else {
643 bufferCharacter(cc);
644 HTML_ADVANCE_TO(ScriptDataEscapedState);
645 }
646 }
647 END_STATE()
648
649 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
650 if (cc == '/') {
651 m_temporaryBuffer.clear();
652 ASSERT(m_bufferedEndTagName.isEmpty());
653 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
654 } else if (isASCIIUpper(cc)) {
655 bufferCharacter('<');
656 bufferCharacter(cc);
657 m_temporaryBuffer.clear();
658 m_temporaryBuffer.append(toLowerCase(cc));
659 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
660 } else if (isASCIILower(cc)) {
661 bufferCharacter('<');
662 bufferCharacter(cc);
663 m_temporaryBuffer.clear();
664 m_temporaryBuffer.append(static_cast<LChar>(cc));
665 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
666 } else {
667 bufferCharacter('<');
668 HTML_RECONSUME_IN(ScriptDataEscapedState);
669 }
670 }
671 END_STATE()
672
673 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
674 if (isASCIIUpper(cc)) {
675 m_temporaryBuffer.append(static_cast<LChar>(cc));
676 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
677 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
678 } else if (isASCIILower(cc)) {
679 m_temporaryBuffer.append(static_cast<LChar>(cc));
680 addToPossibleEndTag(static_cast<LChar>(cc));
681 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
682 } else {
683 bufferCharacter('<');
684 bufferCharacter('/');
685 HTML_RECONSUME_IN(ScriptDataEscapedState);
686 }
687 }
688 END_STATE()
689
690 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
691 if (isASCIIUpper(cc)) {
692 m_temporaryBuffer.append(static_cast<LChar>(cc));
693 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
694 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
695 } else if (isASCIILower(cc)) {
696 m_temporaryBuffer.append(static_cast<LChar>(cc));
697 addToPossibleEndTag(static_cast<LChar>(cc));
698 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
699 } else {
700 if (isTokenizerWhitespace(cc)) {
701 if (isAppropriateEndTag()) {
702 m_temporaryBuffer.append(static_cast<LChar>(cc));
703 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
704 }
705 } else if (cc == '/') {
706 if (isAppropriateEndTag()) {
707 m_temporaryBuffer.append(static_cast<LChar>(cc));
708 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
709 }
710 } else if (cc == '>') {
711 if (isAppropriateEndTag()) {
712 m_temporaryBuffer.append(static_cast<LChar>(cc));
713 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
714 }
715 }
716 bufferCharacter('<');
717 bufferCharacter('/');
718 m_token->appendToCharacter(m_temporaryBuffer);
719 m_bufferedEndTagName.clear();
720 m_temporaryBuffer.clear();
721 HTML_RECONSUME_IN(ScriptDataEscapedState);
722 }
723 }
724 END_STATE()
725
726 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
727 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
728 bufferCharacter(cc);
729 if (temporaryBufferIs(scriptTag.localName()))
730 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
731 else
732 HTML_ADVANCE_TO(ScriptDataEscapedState);
733 } else if (isASCIIUpper(cc)) {
734 bufferCharacter(cc);
735 m_temporaryBuffer.append(toLowerCase(cc));
736 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
737 } else if (isASCIILower(cc)) {
738 bufferCharacter(cc);
739 m_temporaryBuffer.append(static_cast<LChar>(cc));
740 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
741 } else
742 HTML_RECONSUME_IN(ScriptDataEscapedState);
743 }
744 END_STATE()
745
746 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
747 if (cc == '-') {
748 bufferCharacter(cc);
749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
750 } else if (cc == '<') {
751 bufferCharacter(cc);
752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
753 } else if (cc == kEndOfFileMarker) {
754 parseError();
755 HTML_RECONSUME_IN(DataState);
756 } else {
757 bufferCharacter(cc);
758 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
759 }
760 }
761 END_STATE()
762
763 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
764 if (cc == '-') {
765 bufferCharacter(cc);
766 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
767 } else if (cc == '<') {
768 bufferCharacter(cc);
769 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
770 } else if (cc == kEndOfFileMarker) {
771 parseError();
772 HTML_RECONSUME_IN(DataState);
773 } else {
774 bufferCharacter(cc);
775 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
776 }
777 }
778 END_STATE()
779
780 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
781 if (cc == '-') {
782 bufferCharacter(cc);
783 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
784 } else if (cc == '<') {
785 bufferCharacter(cc);
786 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
787 } else if (cc == '>') {
788 bufferCharacter(cc);
789 HTML_ADVANCE_TO(ScriptDataState);
790 } else if (cc == kEndOfFileMarker) {
791 parseError();
792 HTML_RECONSUME_IN(DataState);
793 } else {
794 bufferCharacter(cc);
795 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
796 }
797 }
798 END_STATE()
799
800 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
801 if (cc == '/') {
802 bufferCharacter(cc);
803 m_temporaryBuffer.clear();
804 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
805 } else
806 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
807 }
808 END_STATE()
809
810 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
811 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
812 bufferCharacter(cc);
813 if (temporaryBufferIs(scriptTag.localName()))
814 HTML_ADVANCE_TO(ScriptDataEscapedState);
815 else
816 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
817 } else if (isASCIIUpper(cc)) {
818 bufferCharacter(cc);
819 m_temporaryBuffer.append(toLowerCase(cc));
820 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
821 } else if (isASCIILower(cc)) {
822 bufferCharacter(cc);
823 m_temporaryBuffer.append(static_cast<LChar>(cc));
824 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
825 } else
826 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
827 }
828 END_STATE()
829
830 HTML_BEGIN_STATE(BeforeAttributeNameState) {
831 if (isTokenizerWhitespace(cc))
832 HTML_ADVANCE_TO(BeforeAttributeNameState);
833 else if (cc == '/')
834 HTML_ADVANCE_TO(SelfClosingStartTagState);
835 else if (cc == '>')
836 return emitAndResumeIn(source, HTMLTokenizer::DataState);
837 else if (isASCIIUpper(cc)) {
838 m_token->addNewAttribute();
839 m_token->beginAttributeName(source.numberOfCharactersConsumed());
840 m_token->appendToAttributeName(toLowerCase(cc));
841 HTML_ADVANCE_TO(AttributeNameState);
842 } else if (cc == kEndOfFileMarker) {
843 parseError();
844 HTML_RECONSUME_IN(DataState);
845 } else {
846 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
847 parseError();
848 m_token->addNewAttribute();
849 m_token->beginAttributeName(source.numberOfCharactersConsumed());
850 m_token->appendToAttributeName(cc);
851 HTML_ADVANCE_TO(AttributeNameState);
852 }
853 }
854 END_STATE()
855
856 HTML_BEGIN_STATE(AttributeNameState) {
857 if (isTokenizerWhitespace(cc)) {
858 m_token->endAttributeName(source.numberOfCharactersConsumed());
859 HTML_ADVANCE_TO(AfterAttributeNameState);
860 } else if (cc == '/') {
861 m_token->endAttributeName(source.numberOfCharactersConsumed());
862 HTML_ADVANCE_TO(SelfClosingStartTagState);
863 } else if (cc == '=') {
864 m_token->endAttributeName(source.numberOfCharactersConsumed());
865 HTML_ADVANCE_TO(BeforeAttributeValueState);
866 } else if (cc == '>') {
867 m_token->endAttributeName(source.numberOfCharactersConsumed());
868 return emitAndResumeIn(source, HTMLTokenizer::DataState);
869 } else if (isASCIIUpper(cc)) {
870 m_token->appendToAttributeName(toLowerCase(cc));
871 HTML_ADVANCE_TO(AttributeNameState);
872 } else if (cc == kEndOfFileMarker) {
873 parseError();
874 m_token->endAttributeName(source.numberOfCharactersConsumed());
875 HTML_RECONSUME_IN(DataState);
876 } else {
877 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
878 parseError();
879 m_token->appendToAttributeName(cc);
880 HTML_ADVANCE_TO(AttributeNameState);
881 }
882 }
883 END_STATE()
884
885 HTML_BEGIN_STATE(AfterAttributeNameState) {
886 if (isTokenizerWhitespace(cc))
887 HTML_ADVANCE_TO(AfterAttributeNameState);
888 else if (cc == '/')
889 HTML_ADVANCE_TO(SelfClosingStartTagState);
890 else if (cc == '=')
891 HTML_ADVANCE_TO(BeforeAttributeValueState);
892 else if (cc == '>')
893 return emitAndResumeIn(source, HTMLTokenizer::DataState);
894 else if (isASCIIUpper(cc)) {
895 m_token->addNewAttribute();
896 m_token->beginAttributeName(source.numberOfCharactersConsumed());
897 m_token->appendToAttributeName(toLowerCase(cc));
898 HTML_ADVANCE_TO(AttributeNameState);
899 } else if (cc == kEndOfFileMarker) {
900 parseError();
901 HTML_RECONSUME_IN(DataState);
902 } else {
903 if (cc == '"' || cc == '\'' || cc == '<')
904 parseError();
905 m_token->addNewAttribute();
906 m_token->beginAttributeName(source.numberOfCharactersConsumed());
907 m_token->appendToAttributeName(cc);
908 HTML_ADVANCE_TO(AttributeNameState);
909 }
910 }
911 END_STATE()
912
913 HTML_BEGIN_STATE(BeforeAttributeValueState) {
914 if (isTokenizerWhitespace(cc))
915 HTML_ADVANCE_TO(BeforeAttributeValueState);
916 else if (cc == '"') {
917 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
918 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
919 } else if (cc == '&') {
920 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
921 HTML_RECONSUME_IN(AttributeValueUnquotedState);
922 } else if (cc == '\'') {
923 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
924 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
925 } else if (cc == '>') {
926 parseError();
927 return emitAndResumeIn(source, HTMLTokenizer::DataState);
928 } else if (cc == kEndOfFileMarker) {
929 parseError();
930 HTML_RECONSUME_IN(DataState);
931 } else {
932 if (cc == '<' || cc == '=' || cc == '`')
933 parseError();
934 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
935 m_token->appendToAttributeValue(cc);
936 HTML_ADVANCE_TO(AttributeValueUnquotedState);
937 }
938 }
939 END_STATE()
940
941 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
942 if (cc == '"') {
943 m_token->endAttributeValue(source.numberOfCharactersConsumed());
944 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
945 } else if (cc == '&') {
946 m_additionalAllowedCharacter = '"';
947 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
948 } else if (cc == kEndOfFileMarker) {
949 parseError();
950 m_token->endAttributeValue(source.numberOfCharactersConsumed());
951 HTML_RECONSUME_IN(DataState);
952 } else {
953 m_token->appendToAttributeValue(cc);
954 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
955 }
956 }
957 END_STATE()
958
959 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
960 if (cc == '\'') {
961 m_token->endAttributeValue(source.numberOfCharactersConsumed());
962 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
963 } else if (cc == '&') {
964 m_additionalAllowedCharacter = '\'';
965 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
966 } else if (cc == kEndOfFileMarker) {
967 parseError();
968 m_token->endAttributeValue(source.numberOfCharactersConsumed());
969 HTML_RECONSUME_IN(DataState);
970 } else {
971 m_token->appendToAttributeValue(cc);
972 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
973 }
974 }
975 END_STATE()
976
977 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
978 if (isTokenizerWhitespace(cc)) {
979 m_token->endAttributeValue(source.numberOfCharactersConsumed());
980 HTML_ADVANCE_TO(BeforeAttributeNameState);
981 } else if (cc == '&') {
982 m_additionalAllowedCharacter = '>';
983 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
984 } else if (cc == '>') {
985 m_token->endAttributeValue(source.numberOfCharactersConsumed());
986 return emitAndResumeIn(source, HTMLTokenizer::DataState);
987 } else if (cc == kEndOfFileMarker) {
988 parseError();
989 m_token->endAttributeValue(source.numberOfCharactersConsumed());
990 HTML_RECONSUME_IN(DataState);
991 } else {
992 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
993 parseError();
994 m_token->appendToAttributeValue(cc);
995 HTML_ADVANCE_TO(AttributeValueUnquotedState);
996 }
997 }
998 END_STATE()
999
1000 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1001 bool notEnoughCharacters = false;
1002 DecodedHTMLEntity decodedEntity;
1003 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1004 if (notEnoughCharacters)
1005 return haveBufferedCharacterToken();
1006 if (!success) {
1007 ASSERT(decodedEntity.isEmpty());
1008 m_token->appendToAttributeValue('&');
1009 } else {
1010 for (unsigned i = 0; i < decodedEntity.length; ++i)
1011 m_token->appendToAttributeValue(decodedEntity.data[i]);
1012 }
        // We're supposed to switch back to the attribute value state that
        // we were in when we were switched into this state. Rather than
        // keeping track of this explicitly, we observe that the previous
        // state can be determined by m_additionalAllowedCharacter.
1017 if (m_additionalAllowedCharacter == '"')
1018 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1019 else if (m_additionalAllowedCharacter == '\'')
1020 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1021 else if (m_additionalAllowedCharacter == '>')
1022 HTML_SWITCH_TO(AttributeValueUnquotedState);
1023 else
1024 ASSERT_NOT_REACHED();
1025 }
1026 END_STATE()
1027
1028 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1029 if (isTokenizerWhitespace(cc))
1030 HTML_ADVANCE_TO(BeforeAttributeNameState);
1031 else if (cc == '/')
1032 HTML_ADVANCE_TO(SelfClosingStartTagState);
1033 else if (cc == '>')
1034 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1035 else if (cc == kEndOfFileMarker) {
1036 parseError();
1037 HTML_RECONSUME_IN(DataState);
1038 } else {
1039 parseError();
1040 HTML_RECONSUME_IN(BeforeAttributeNameState);
1041 }
1042 }
1043 END_STATE()
1044
1045 HTML_BEGIN_STATE(SelfClosingStartTagState) {
1046 if (cc == '>') {
1047 m_token->setSelfClosing();
1048 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1049 } else if (cc == kEndOfFileMarker) {
1050 parseError();
1051 HTML_RECONSUME_IN(DataState);
1052 } else {
1053 parseError();
1054 HTML_RECONSUME_IN(BeforeAttributeNameState);
1055 }
1056 }
1057 END_STATE()
1058
1059 HTML_BEGIN_STATE(BogusCommentState) {
1060 m_token->beginComment();
1061 HTML_RECONSUME_IN(ContinueBogusCommentState);
1062 }
1063 END_STATE()
1064
1065 HTML_BEGIN_STATE(ContinueBogusCommentState) {
1066 if (cc == '>')
1067 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1068 else if (cc == kEndOfFileMarker)
1069 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1070 else {
1071 m_token->appendToComment(cc);
1072 HTML_ADVANCE_TO(ContinueBogusCommentState);
1073 }
1074 }
1075 END_STATE()
1076
1077 HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1078 if (cc == '-') {
1079 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
1080 if (result == SegmentedString::DidMatch) {
1081 source.advanceAndASSERT('-');
1082 source.advanceAndASSERT('-');
1083 m_token->beginComment();
1084 HTML_SWITCH_TO(CommentStartState);
1085 } else if (result == SegmentedString::NotEnoughCharacters)
1086 return haveBufferedCharacterToken();
1087 } else if (cc == 'D' || cc == 'd') {
1088 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::doctype);
1089 if (result == SegmentedString::DidMatch) {
1090 advanceStringAndASSERTIgnoringCase(source, "doctype");
1091 HTML_SWITCH_TO(DOCTYPEState);
1092 } else if (result == SegmentedString::NotEnoughCharacters)
1093 return haveBufferedCharacterToken();
1094 } else if (cc == '[' && shouldAllowCDATA()) {
1095 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::cdata);
1096 if (result == SegmentedString::DidMatch) {
1097 advanceStringAndASSERT(source, "[CDATA[");
1098 HTML_SWITCH_TO(CDATASectionState);
1099 } else if (result == SegmentedString::NotEnoughCharacters)
1100 return haveBufferedCharacterToken();
1101 }
1102 parseError();
1103 HTML_RECONSUME_IN(BogusCommentState);
1104 }
1105 END_STATE()
1106
1107 HTML_BEGIN_STATE(CommentStartState) {
1108 if (cc == '-')
1109 HTML_ADVANCE_TO(CommentStartDashState);
1110 else if (cc == '>') {
1111 parseError();
1112 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1113 } else if (cc == kEndOfFileMarker) {
1114 parseError();
1115 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1116 } else {
1117 m_token->appendToComment(cc);
1118 HTML_ADVANCE_TO(CommentState);
1119 }
1120 }
1121 END_STATE()
1122
1123 HTML_BEGIN_STATE(CommentStartDashState) {
1124 if (cc == '-')
1125 HTML_ADVANCE_TO(CommentEndState);
1126 else if (cc == '>') {
1127 parseError();
1128 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1129 } else if (cc == kEndOfFileMarker) {
1130 parseError();
1131 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1132 } else {
1133 m_token->appendToComment('-');
1134 m_token->appendToComment(cc);
1135 HTML_ADVANCE_TO(CommentState);
1136 }
1137 }
1138 END_STATE()
1139
1140 HTML_BEGIN_STATE(CommentState) {
1141 if (cc == '-')
1142 HTML_ADVANCE_TO(CommentEndDashState);
1143 else if (cc == kEndOfFileMarker) {
1144 parseError();
1145 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1146 } else {
1147 m_token->appendToComment(cc);
1148 HTML_ADVANCE_TO(CommentState);
1149 }
1150 }
1151 END_STATE()
1152
1153 HTML_BEGIN_STATE(CommentEndDashState) {
1154 if (cc == '-')
1155 HTML_ADVANCE_TO(CommentEndState);
1156 else if (cc == kEndOfFileMarker) {
1157 parseError();
1158 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1159 } else {
1160 m_token->appendToComment('-');
1161 m_token->appendToComment(cc);
1162 HTML_ADVANCE_TO(CommentState);
1163 }
1164 }
1165 END_STATE()
1166
1167 HTML_BEGIN_STATE(CommentEndState) {
1168 if (cc == '>')
1169 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1170 else if (cc == '!') {
1171 parseError();
1172 HTML_ADVANCE_TO(CommentEndBangState);
1173 } else if (cc == '-') {
1174 parseError();
1175 m_token->appendToComment('-');
1176 HTML_ADVANCE_TO(CommentEndState);
1177 } else if (cc == kEndOfFileMarker) {
1178 parseError();
1179 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1180 } else {
1181 parseError();
1182 m_token->appendToComment('-');
1183 m_token->appendToComment('-');
1184 m_token->appendToComment(cc);
1185 HTML_ADVANCE_TO(CommentState);
1186 }
1187 }
1188 END_STATE()
1189
1190 HTML_BEGIN_STATE(CommentEndBangState) {
1191 if (cc == '-') {
1192 m_token->appendToComment('-');
1193 m_token->appendToComment('-');
1194 m_token->appendToComment('!');
1195 HTML_ADVANCE_TO(CommentEndDashState);
1196 } else if (cc == '>')
1197 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1198 else if (cc == kEndOfFileMarker) {
1199 parseError();
1200 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1201 } else {
1202 m_token->appendToComment('-');
1203 m_token->appendToComment('-');
1204 m_token->appendToComment('!');
1205 m_token->appendToComment(cc);
1206 HTML_ADVANCE_TO(CommentState);
1207 }
1208 }
1209 END_STATE()
1210
1211 HTML_BEGIN_STATE(DOCTYPEState) {
1212 if (isTokenizerWhitespace(cc))
1213 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1214 else if (cc == kEndOfFileMarker) {
1215 parseError();
1216 m_token->beginDOCTYPE();
1217 m_token->setForceQuirks();
1218 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1219 } else {
1220 parseError();
1221 HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1222 }
1223 }
1224 END_STATE()
1225
1226 HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1227 if (isTokenizerWhitespace(cc))
1228 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1229 else if (isASCIIUpper(cc)) {
1230 m_token->beginDOCTYPE(toLowerCase(cc));
1231 HTML_ADVANCE_TO(DOCTYPENameState);
1232 } else if (cc == '>') {
1233 parseError();
1234 m_token->beginDOCTYPE();
1235 m_token->setForceQuirks();
1236 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1237 } else if (cc == kEndOfFileMarker) {
1238 parseError();
1239 m_token->beginDOCTYPE();
1240 m_token->setForceQuirks();
1241 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1242 } else {
1243 m_token->beginDOCTYPE(cc);
1244 HTML_ADVANCE_TO(DOCTYPENameState);
1245 }
1246 }
1247 END_STATE()
1248
1249 HTML_BEGIN_STATE(DOCTYPENameState) {
1250 if (isTokenizerWhitespace(cc))
1251 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1252 else if (cc == '>')
1253 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1254 else if (isASCIIUpper(cc)) {
1255 m_token->appendToName(toLowerCase(cc));
1256 HTML_ADVANCE_TO(DOCTYPENameState);
1257 } else if (cc == kEndOfFileMarker) {
1258 parseError();
1259 m_token->setForceQuirks();
1260 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1261 } else {
1262 m_token->appendToName(cc);
1263 HTML_ADVANCE_TO(DOCTYPENameState);
1264 }
1265 }
1266 END_STATE()
1267
1268 HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1269 if (isTokenizerWhitespace(cc))
1270 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1271 if (cc == '>')
1272 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1273 else if (cc == kEndOfFileMarker) {
1274 parseError();
1275 m_token->setForceQuirks();
1276 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1277 } else {
1278 if (cc == 'P' || cc == 'p') {
1279 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::publicString);
1280 if (result == SegmentedString::DidMatch) {
1281 advanceStringAndASSERTIgnoringCase(source, "public");
1282 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1283 } else if (result == SegmentedString::NotEnoughCharacters)
1284 return haveBufferedCharacterToken();
1285 } else if (cc == 'S' || cc == 's') {
1286 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::system);
1287 if (result == SegmentedString::DidMatch) {
1288 advanceStringAndASSERTIgnoringCase(source, "system");
1289 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1290 } else if (result == SegmentedString::NotEnoughCharacters)
1291 return haveBufferedCharacterToken();
1292 }
1293 parseError();
1294 m_token->setForceQuirks();
1295 HTML_ADVANCE_TO(BogusDOCTYPEState);
1296 }
1297 }
1298 END_STATE()
1299
1300 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1301 if (isTokenizerWhitespace(cc))
1302 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1303 else if (cc == '"') {
1304 parseError();
1305 m_token->setPublicIdentifierToEmptyString();
1306 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1307 } else if (cc == '\'') {
1308 parseError();
1309 m_token->setPublicIdentifierToEmptyString();
1310 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1311 } else if (cc == '>') {
1312 parseError();
1313 m_token->setForceQuirks();
1314 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1315 } else if (cc == kEndOfFileMarker) {
1316 parseError();
1317 m_token->setForceQuirks();
1318 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1319 } else {
1320 parseError();
1321 m_token->setForceQuirks();
1322 HTML_ADVANCE_TO(BogusDOCTYPEState);
1323 }
1324 }
1325 END_STATE()
1326
1327 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1328 if (isTokenizerWhitespace(cc))
1329 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1330 else if (cc == '"') {
1331 m_token->setPublicIdentifierToEmptyString();
1332 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1333 } else if (cc == '\'') {
1334 m_token->setPublicIdentifierToEmptyString();
1335 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1336 } else if (cc == '>') {
1337 parseError();
1338 m_token->setForceQuirks();
1339 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1340 } else if (cc == kEndOfFileMarker) {
1341 parseError();
1342 m_token->setForceQuirks();
1343 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1344 } else {
1345 parseError();
1346 m_token->setForceQuirks();
1347 HTML_ADVANCE_TO(BogusDOCTYPEState);
1348 }
1349 }
1350 END_STATE()
1351
1352 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1353 if (cc == '"')
1354 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1355 else if (cc == '>') {
1356 parseError();
1357 m_token->setForceQuirks();
1358 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1359 } else if (cc == kEndOfFileMarker) {
1360 parseError();
1361 m_token->setForceQuirks();
1362 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1363 } else {
1364 m_token->appendToPublicIdentifier(cc);
1365 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1366 }
1367 }
1368 END_STATE()
1369
1370 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1371 if (cc == '\'')
1372 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1373 else if (cc == '>') {
1374 parseError();
1375 m_token->setForceQuirks();
1376 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1377 } else if (cc == kEndOfFileMarker) {
1378 parseError();
1379 m_token->setForceQuirks();
1380 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1381 } else {
1382 m_token->appendToPublicIdentifier(cc);
1383 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1384 }
1385 }
1386 END_STATE()
1387
1388 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1389 if (isTokenizerWhitespace(cc))
1390 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1391 else if (cc == '>')
1392 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1393 else if (cc == '"') {
1394 parseError();
1395 m_token->setSystemIdentifierToEmptyString();
1396 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1397 } else if (cc == '\'') {
1398 parseError();
1399 m_token->setSystemIdentifierToEmptyString();
1400 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1401 } else if (cc == kEndOfFileMarker) {
1402 parseError();
1403 m_token->setForceQuirks();
1404 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1405 } else {
1406 parseError();
1407 m_token->setForceQuirks();
1408 HTML_ADVANCE_TO(BogusDOCTYPEState);
1409 }
1410 }
1411 END_STATE()
1412
1413 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1414 if (isTokenizerWhitespace(cc))
1415 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1416 else if (cc == '>')
1417 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1418 else if (cc == '"') {
1419 m_token->setSystemIdentifierToEmptyString();
1420 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1421 } else if (cc == '\'') {
1422 m_token->setSystemIdentifierToEmptyString();
1423 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1424 } else if (cc == kEndOfFileMarker) {
1425 parseError();
1426 m_token->setForceQuirks();
1427 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1428 } else {
1429 parseError();
1430 m_token->setForceQuirks();
1431 HTML_ADVANCE_TO(BogusDOCTYPEState);
1432 }
1433 }
1434 END_STATE()
1435
1436 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1437 if (isTokenizerWhitespace(cc))
1438 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1439 else if (cc == '"') {
1440 parseError();
1441 m_token->setSystemIdentifierToEmptyString();
1442 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1443 } else if (cc == '\'') {
1444 parseError();
1445 m_token->setSystemIdentifierToEmptyString();
1446 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1447 } else if (cc == '>') {
1448 parseError();
1449 m_token->setForceQuirks();
1450 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1451 } else if (cc == kEndOfFileMarker) {
1452 parseError();
1453 m_token->setForceQuirks();
1454 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1455 } else {
1456 parseError();
1457 m_token->setForceQuirks();
1458 HTML_ADVANCE_TO(BogusDOCTYPEState);
1459 }
1460 }
1461 END_STATE()
1462
1463 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1464 if (isTokenizerWhitespace(cc))
1465 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1466 if (cc == '"') {
1467 m_token->setSystemIdentifierToEmptyString();
1468 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1469 } else if (cc == '\'') {
1470 m_token->setSystemIdentifierToEmptyString();
1471 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1472 } else if (cc == '>') {
1473 parseError();
1474 m_token->setForceQuirks();
1475 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1476 } else if (cc == kEndOfFileMarker) {
1477 parseError();
1478 m_token->setForceQuirks();
1479 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1480 } else {
1481 parseError();
1482 m_token->setForceQuirks();
1483 HTML_ADVANCE_TO(BogusDOCTYPEState);
1484 }
1485 }
1486 END_STATE()
1487
1488 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1489 if (cc == '"')
1490 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1491 else if (cc == '>') {
1492 parseError();
1493 m_token->setForceQuirks();
1494 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1495 } else if (cc == kEndOfFileMarker) {
1496 parseError();
1497 m_token->setForceQuirks();
1498 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1499 } else {
1500 m_token->appendToSystemIdentifier(cc);
1501 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1502 }
1503 }
1504 END_STATE()
1505
1506 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1507 if (cc == '\'')
1508 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1509 else if (cc == '>') {
1510 parseError();
1511 m_token->setForceQuirks();
1512 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1513 } else if (cc == kEndOfFileMarker) {
1514 parseError();
1515 m_token->setForceQuirks();
1516 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1517 } else {
1518 m_token->appendToSystemIdentifier(cc);
1519 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1520 }
1521 }
1522 END_STATE()
1523
1524 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1525 if (isTokenizerWhitespace(cc))
1526 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1527 else if (cc == '>')
1528 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1529 else if (cc == kEndOfFileMarker) {
1530 parseError();
1531 m_token->setForceQuirks();
1532 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1533 } else {
1534 parseError();
1535 HTML_ADVANCE_TO(BogusDOCTYPEState);
1536 }
1537 }
1538 END_STATE()
1539
1540 HTML_BEGIN_STATE(BogusDOCTYPEState) {
1541 if (cc == '>')
1542 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1543 else if (cc == kEndOfFileMarker)
1544 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1545 HTML_ADVANCE_TO(BogusDOCTYPEState);
1546 }
1547 END_STATE()
1548
1549 HTML_BEGIN_STATE(CDATASectionState) {
1550 if (cc == ']')
1551 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1552 else if (cc == kEndOfFileMarker)
1553 HTML_RECONSUME_IN(DataState);
1554 else {
1555 bufferCharacter(cc);
1556 HTML_ADVANCE_TO(CDATASectionState);
1557 }
1558 }
1559 END_STATE()
1560
1561 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1562 if (cc == ']')
1563 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1564 else {
1565 bufferCharacter(']');
1566 HTML_RECONSUME_IN(CDATASectionState);
1567 }
1568 }
1569
1570 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1571 if (cc == '>')
1572 HTML_ADVANCE_TO(DataState);
1573 else {
1574 bufferCharacter(']');
1575 bufferCharacter(']');
1576 HTML_RECONSUME_IN(CDATASectionState);
1577 }
1578 }
1579 END_STATE()
1580
1581 }
1582
1583 ASSERT_NOT_REACHED();
1584 return false;
1585 }
1586
bufferedCharacters() const1587 String HTMLTokenizer::bufferedCharacters() const
1588 {
1589 // FIXME: Add an assert about m_state.
1590 StringBuilder characters;
1591 characters.reserveCapacity(numberOfBufferedCharacters());
1592 characters.append('<');
1593 characters.append('/');
1594 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1595 return characters.toString();
1596 }
1597
updateStateFor(const String & tagName)1598 void HTMLTokenizer::updateStateFor(const String& tagName)
1599 {
1600 if (threadSafeMatch(tagName, textareaTag) || threadSafeMatch(tagName, titleTag))
1601 setState(HTMLTokenizer::RCDATAState);
1602 else if (threadSafeMatch(tagName, plaintextTag))
1603 setState(HTMLTokenizer::PLAINTEXTState);
1604 else if (threadSafeMatch(tagName, scriptTag))
1605 setState(HTMLTokenizer::ScriptDataState);
1606 else if (threadSafeMatch(tagName, styleTag)
1607 || threadSafeMatch(tagName, iframeTag)
1608 || threadSafeMatch(tagName, xmpTag)
1609 || (threadSafeMatch(tagName, noembedTag) && m_options.pluginsEnabled)
1610 || threadSafeMatch(tagName, noframesTag)
1611 || (threadSafeMatch(tagName, noscriptTag) && m_options.scriptEnabled))
1612 setState(HTMLTokenizer::RAWTEXTState);
1613 }
1614
temporaryBufferIs(const String & expectedString)1615 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1616 {
1617 return vectorEqualsString(m_temporaryBuffer, expectedString);
1618 }
1619
// Appends |cc| to m_bufferedEndTagName.
// Asserts that isEndTagBufferingState(m_state) holds on entry.
inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
{
    ASSERT(isEndTagBufferingState(m_state));
    m_bufferedEndTagName.append(cc);
}
1625
isAppropriateEndTag()1626 inline bool HTMLTokenizer::isAppropriateEndTag()
1627 {
1628 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1629 return false;
1630
1631 size_t numCharacters = m_bufferedEndTagName.size();
1632
1633 for (size_t i = 0; i < numCharacters; i++) {
1634 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1635 return false;
1636 }
1637
1638 return true;
1639 }
1640
// Called by the tokenizer states whenever they detect a parse error.
// At present this only invokes notImplemented(); no error is recorded or
// reported from here.
inline void HTMLTokenizer::parseError()
{
    notImplemented();
}
1645
1646 }
1647