'use strict';

var Preprocessor = require('./preprocessor'),
    LocationInfoMixin = require('./location_info_mixin'),
    UNICODE = require('../common/unicode'),
    NAMED_ENTITY_TRIE = require('./named_entity_trie');

//Aliases
var $ = UNICODE.CODE_POINTS,
    $$ = UNICODE.CODE_POINT_SEQUENCES;

//Replacement code points for numeric entities
//NOTE: keys are the code points referenced by a numeric entity, values are the code points that the tokenizer
//returns instead (see the lookup in _consumeNumericEntity).
var NUMERIC_ENTITY_REPLACEMENTS = {
    0x00: 0xFFFD, 0x0D: 0x000D, 0x80: 0x20AC, 0x81: 0x0081, 0x82: 0x201A, 0x83: 0x0192, 0x84: 0x201E,
    0x85: 0x2026, 0x86: 0x2020, 0x87: 0x2021, 0x88: 0x02C6, 0x89: 0x2030, 0x8A: 0x0160, 0x8B: 0x2039,
    0x8C: 0x0152, 0x8D: 0x008D, 0x8E: 0x017D, 0x8F: 0x008F, 0x90: 0x0090, 0x91: 0x2018, 0x92: 0x2019,
    0x93: 0x201C, 0x94: 0x201D, 0x95: 0x2022, 0x96: 0x2013, 0x97: 0x2014, 0x98: 0x02DC, 0x99: 0x2122,
    0x9A: 0x0161, 0x9B: 0x203A, 0x9C: 0x0153, 0x9D: 0x009D, 0x9E: 0x017E, 0x9F: 0x0178
};

//States
//NOTE: each state name is also the key under which the state's handler is stored on Tokenizer.prototype,
//so `this[this.state](cp)` dispatches to the handler of the current state.
var DATA_STATE = 'DATA_STATE',
    CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE',
    RCDATA_STATE = 'RCDATA_STATE',
    CHARACTER_REFERENCE_IN_RCDATA_STATE = 'CHARACTER_REFERENCE_IN_RCDATA_STATE',
    RAWTEXT_STATE = 'RAWTEXT_STATE',
    SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE',
    PLAINTEXT_STATE = 'PLAINTEXT_STATE',
    TAG_OPEN_STATE = 'TAG_OPEN_STATE',
    END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE',
    TAG_NAME_STATE = 'TAG_NAME_STATE',
    RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE',
    RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE',
    RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE',
    RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE',
    RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE',
    RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE',
    SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE',
    SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE',
    SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE',
    SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE',
    SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE',
    SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE',
    SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE',
    SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE',
    SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE',
    BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE',
    ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE',
    AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE',
    BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE',
    ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE',
    ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE',
    ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE',
    CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE = 'CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE',
    AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE',
    SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE',
    BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE',
    MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE',
    COMMENT_START_STATE = 'COMMENT_START_STATE',
    COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE',
    COMMENT_STATE = 'COMMENT_STATE',
    COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE',
    COMMENT_END_STATE = 'COMMENT_END_STATE',
    COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE',
    DOCTYPE_STATE = 'DOCTYPE_STATE',
    BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE',
    DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE',
    AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE',
    AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE',
    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE',
    DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE',
    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE',
    AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE',
    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE',
    AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE',
    BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE',
    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE',
    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE',
    AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE',
    BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE',
    CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';

//Utils

//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
//these functions if they are situated in another module due to context switch.
//Always perform an inlining check before modifying these functions ('node --trace-inlining').
//Returns true if cp is SPACE, LINE FEED, TABULATION or FORM FEED.
function isWhitespace(cp) {
    return cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED;
}

//Returns true if cp is in the range DIGIT_0..DIGIT_9.
function isAsciiDigit(cp) {
    return cp >= $.DIGIT_0 && cp <= $.DIGIT_9;
}

//Returns true if cp is in the range LATIN_CAPITAL_A..LATIN_CAPITAL_Z.
function isAsciiUpper(cp) {
    return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_Z;
}

//Returns true if cp is in the range LATIN_SMALL_A..LATIN_SMALL_Z.
function isAsciiLower(cp) {
    return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_Z;
}

//Returns true if cp is an ASCII digit or an ASCII letter of either case.
function isAsciiAlphaNumeric(cp) {
    return isAsciiDigit(cp) || isAsciiUpper(cp) || isAsciiLower(cp);
}

//Returns a truthy value if cp is a decimal digit or, when isHex is truthy, a hexadecimal digit (A-F in either case).
//NOTE: the result is only guaranteed to be a strict boolean when isHex is itself a boolean.
function isDigit(cp, isHex) {
    return isAsciiDigit(cp) || (isHex && ((cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F) ||
                                          (cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F)));
}

//Returns true if cp is in the surrogate range (0xD800..0xDFFF) or is greater than 0x10FFFF.
function isReservedCodePoint(cp) {
    return cp >= 0xD800 && cp <= 0xDFFF || cp > 0x10FFFF;
}

//Adds 0x20 to cp, which maps an ASCII upper-case letter to its lower-case counterpart.
//NOTE: no range check is performed here, so callers must pass only ASCII upper-case letters.
function toAsciiLowerCodePoint(cp) {
    return cp + 0x0020;
}

//NOTE: String.fromCharCode() function can handle only characters from BMP subset.
//So, we need to workaround this manually.
//(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values)
//Converts a code point to a string. Code points above 0xFFFF are encoded as a surrogate pair.
function toChar(cp) {
    if (cp <= 0xFFFF)
        return String.fromCharCode(cp);

    cp -= 0x10000;
    return String.fromCharCode(cp >>> 10 & 0x3FF | 0xD800) + String.fromCharCode(0xDC00 | cp & 0x3FF);
}

//Converts an ASCII upper-case letter code point to the corresponding lower-case character.
function toAsciiLowerChar(cp) {
    return String.fromCharCode(toAsciiLowerCodePoint(cp));
}

//Tokenizer
//Creates a tokenizer over the given html. It starts in the data state with an empty token queue.
//options (optional):
//  decodeHtmlEntities - NOTE: if an options object is passed, character references are decoded only
//                       when this flag is truthy; without an options object decoding stays enabled.
//  locationInfo       - if truthy, LocationInfoMixin is assigned to this tokenizer instance.
var Tokenizer = module.exports = function (html, options) {
    this.disableEntitiesDecoding = false;

    this.preprocessor = new Preprocessor(html);

    this.tokenQueue = [];

    this.allowCDATA = false;

    this.state = DATA_STATE;
    this.returnState = '';

    //NOTE: number of code points consumed so far; used to roll back failed pattern matches.
    this.consumptionPos = 0;

    this.tempBuff = [];
    this.additionalAllowedCp = void 0;
    this.lastStartTagName = '';

    this.currentCharacterToken = null;
    this.currentToken = null;
    this.currentAttr = null;

    if (options) {
        this.disableEntitiesDecoding = !options.decodeHtmlEntities;

        if (options.locationInfo)
            LocationInfoMixin.assign(this);
    }
};

//Token types
Tokenizer.CHARACTER_TOKEN = 'CHARACTER_TOKEN';
Tokenizer.NULL_CHARACTER_TOKEN = 'NULL_CHARACTER_TOKEN';
Tokenizer.WHITESPACE_CHARACTER_TOKEN = 'WHITESPACE_CHARACTER_TOKEN';
Tokenizer.START_TAG_TOKEN = 'START_TAG_TOKEN';
Tokenizer.END_TAG_TOKEN = 'END_TAG_TOKEN';
Tokenizer.COMMENT_TOKEN = 'COMMENT_TOKEN';
Tokenizer.DOCTYPE_TOKEN = 'DOCTYPE_TOKEN';
Tokenizer.EOF_TOKEN = 'EOF_TOKEN';

//Tokenizer initial states for different modes
Tokenizer.MODE = Tokenizer.prototype.MODE = {
    DATA: DATA_STATE,
    RCDATA: RCDATA_STATE,
    RAWTEXT: RAWTEXT_STATE,
    SCRIPT_DATA: SCRIPT_DATA_STATE,
    PLAINTEXT: PLAINTEXT_STATE
};

//Static
//Returns the value of the attribute named attrName on the given tag token, or null if there is no such attribute.
//NOTE: attributes are scanned from the end of token.attrs.
Tokenizer.getTokenAttr = function (token, attrName) {
    for (var i = token.attrs.length - 1; i >= 0; i--) {
        if (token.attrs[i].name === attrName)
            return token.attrs[i].value;
    }

    return null;
};

//Get token
//Runs the state machine (one consumed code point per step, dispatched to the handler of the current state)
//until at least one token is queued, then removes and returns the oldest queued token.
Tokenizer.prototype.getNextToken = function () {
    while (!this.tokenQueue.length)
        this[this.state](this._consume());

    return this.tokenQueue.shift();
};

//Consumption
//Advances the input by one code point and returns it.
Tokenizer.prototype._consume = function () {
    this.consumptionPos++;
    return this.preprocessor.advanceAndPeekCodePoint();
};

//Steps the input back by one code point.
Tokenizer.prototype._unconsume = function () {
    this.consumptionPos--;
    this.preprocessor.retreat();
};

//Steps the input back by `count` code points.
Tokenizer.prototype._unconsumeSeveral = function (count) {
    while (count--)
        this._unconsume();
};

//Switches to the given state and unconsumes the current code point, so that the new state receives it again.
Tokenizer.prototype._reconsumeInState = function (state) {
    this.state = state;
    this._unconsume();
};

//Checks whether startCp (already consumed) followed by the upcoming input matches `pattern`,
//which is indexed as a sequence of code points. Returns true on a match, leaving the matched input consumed.
//On a mismatch or EOF every code point consumed by this call is unconsumed and false is returned.
//NOTE: if caseSensitive is falsy, an input code point also matches `pattern code point + 0x20`,
//i.e. the lower-case form of an ASCII upper-case pattern letter.
Tokenizer.prototype._consumeSubsequentIfMatch = function (pattern, startCp, caseSensitive) {
    var rollbackPos = this.consumptionPos,
        isMatch = true,
        patternLength = pattern.length,
        patternPos = 0,
        cp = startCp,
        patternCp = void 0;

    for (; patternPos < patternLength; patternPos++) {
        //NOTE: the first pattern position is compared against startCp, which the caller has already consumed.
        if (patternPos > 0)
            cp = this._consume();

        if (cp === $.EOF) {
            isMatch = false;
            break;
        }

        patternCp = pattern[patternPos];

        if (cp !== patternCp && (caseSensitive || cp !== toAsciiLowerCodePoint(patternCp))) {
            isMatch = false;
            break;
        }
    }

    if (!isMatch)
        this._unconsumeSeveral(this.consumptionPos - rollbackPos);

    return isMatch;
};

//Lookahead
//Returns the next code point without consuming it.
Tokenizer.prototype._lookahead = function () {
    var cp = this.preprocessor.advanceAndPeekCodePoint();
    this.preprocessor.retreat();

    return cp;
};

//Temp buffer
//Returns true if the temporary buffer holds exactly the code points of $$.SCRIPT_STRING.
Tokenizer.prototype.isTempBufferEqualToScriptString = function () {
    if (this.tempBuff.length !== $$.SCRIPT_STRING.length)
        return false;

    for (var i = 0; i < this.tempBuff.length; i++) {
        if (this.tempBuff[i] !== $$.SCRIPT_STRING[i])
            return false;
    }

    return true;
};

//Token creation
//Builds a start tag token with the given tag name, no attributes and selfClosing set to false.
Tokenizer.prototype.buildStartTagToken = function (tagName) {
    return {
        type: Tokenizer.START_TAG_TOKEN,
        tagName: tagName,
        selfClosing: false,
        attrs: []
    };
};

//Builds an end tag token with the given tag name, no attributes and ignored set to false.
Tokenizer.prototype.buildEndTagToken = function (tagName) {
    return {
        type: Tokenizer.END_TAG_TOKEN,
        tagName: tagName,
        ignored: false,
        attrs: []
    };
};

//Makes a new start tag token the current token; tagNameFirstCh becomes the initial tag name.
Tokenizer.prototype._createStartTagToken = function (tagNameFirstCh) {
    this.currentToken = this.buildStartTagToken(tagNameFirstCh);
};

//Makes a new end tag token the current token; tagNameFirstCh becomes the initial tag name.
Tokenizer.prototype._createEndTagToken = function (tagNameFirstCh) {
    this.currentToken = this.buildEndTagToken(tagNameFirstCh);
};

//Makes a new comment token with empty data the current token.
Tokenizer.prototype._createCommentToken = function () {
    this.currentToken = {
        type: Tokenizer.COMMENT_TOKEN,
        data: ''
    };
};

//Makes a new doctype token the current token. The name defaults to an empty string if
//doctypeNameFirstCh is not provided.
Tokenizer.prototype._createDoctypeToken = function (doctypeNameFirstCh) {
    this.currentToken = {
        type: Tokenizer.DOCTYPE_TOKEN,
        name: doctypeNameFirstCh || '',
        forceQuirks: false,
        publicId: null,
        systemId: null
    };
};

//Makes a new character token of the given type the current character token; ch becomes its initial content.
Tokenizer.prototype._createCharacterToken = function (type, ch) {
    this.currentCharacterToken = {
        type: type,
        chars: ch
    };
};

//Tag attributes
//Makes a new attribute with an empty value the current attribute; attrNameFirstCh becomes its initial name.
Tokenizer.prototype._createAttr = function (attrNameFirstCh) {
    this.currentAttr = {
        name: attrNameFirstCh,
        value: ''
    };
};

//Returns true if the current token already has an attribute with the current attribute's name.
Tokenizer.prototype._isDuplicateAttr = function () {
    return Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name) !== null;
};

//Switches to toState and adds the current attribute to the current token, unless its name is a duplicate.
Tokenizer.prototype._leaveAttrName = function (toState) {
    this.state = toState;

    if (!this._isDuplicateAttr())
        this.currentToken.attrs.push(this.currentAttr);
};

//Appropriate end tag token
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#appropriate-end-tag-token)
Tokenizer.prototype._isAppropriateEndTagToken = function () {
    return this.lastStartTagName === this.currentToken.tagName;
};

//Token emission
//Queues the pending character token (if any) and then the current token, and clears the current token.
Tokenizer.prototype._emitCurrentToken = function () {
    this._emitCurrentCharacterToken();

    //NOTE: store the emitted start tag's tagName to determine whether the following end tag token is appropriate.
    if (this.currentToken.type === Tokenizer.START_TAG_TOKEN)
        this.lastStartTagName = this.currentToken.tagName;

    this.tokenQueue.push(this.currentToken);
    this.currentToken = null;
};

//Queues the pending character token, if there is one, and clears it.
Tokenizer.prototype._emitCurrentCharacterToken = function () {
    if (this.currentCharacterToken) {
        this.tokenQueue.push(this.currentCharacterToken);
        this.currentCharacterToken = null;
    }
};

//Queues the pending character token (if any) followed by an EOF token.
Tokenizer.prototype._emitEOFToken = function () {
    this._emitCurrentCharacterToken();
    this.tokenQueue.push({type: Tokenizer.EOF_TOKEN});
};

//Characters emission

//OPTIMIZATION: specification uses only one type of character tokens (one token per character).
//This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
//If we have a sequence of characters that belong to the same group, parser can process it
//as a single solid character token.
//So, there are 3 types of character tokens in parse5:
//1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
//2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
//3)CHARACTER_TOKEN - any character sequence which doesn't belong to groups 1 and 2 (e.g.
//'abcdef1234@@#$%^')
//Appends ch to the pending character token if it has the same type. Otherwise the pending token (if any)
//is queued first and a new character token of the given type is started with ch.
Tokenizer.prototype._appendCharToCurrentCharacterToken = function (type, ch) {
    if (this.currentCharacterToken && this.currentCharacterToken.type !== type)
        this._emitCurrentCharacterToken();

    if (this.currentCharacterToken)
        this.currentCharacterToken.chars += ch;

    else
        this._createCharacterToken(type, ch);
};

//Emits a code point as a character of the matching group: whitespace, null or ordinary character.
Tokenizer.prototype._emitCodePoint = function (cp) {
    var type = Tokenizer.CHARACTER_TOKEN;

    if (isWhitespace(cp))
        type = Tokenizer.WHITESPACE_CHARACTER_TOKEN;

    else if (cp === $.NULL)
        type = Tokenizer.NULL_CHARACTER_TOKEN;

    this._appendCharToCurrentCharacterToken(type, toChar(cp));
};

//Emits every code point of the given array, in order.
Tokenizer.prototype._emitSeveralCodePoints = function (codePoints) {
    for (var i = 0; i < codePoints.length; i++)
        this._emitCodePoint(codePoints[i]);
};

//NOTE: used when we emit a character explicitly. This is always a non-whitespace and a non-null character.
//So we can avoid additional checks here.
Tokenizer.prototype._emitChar = function (ch) {
    this._appendCharToCurrentCharacterToken(Tokenizer.CHARACTER_TOKEN, ch);
};

//Character reference tokenization
//Consumes the digits of a numeric entity (hexadecimal if isHex is truthy, decimal otherwise) together with
//an optional terminating semicolon and returns the code point that should be emitted for it.
//NOTE: the caller must ensure that the next code point is a digit - the first one is consumed unconditionally.
Tokenizer.prototype._consumeNumericEntity = function (isHex) {
    var digits = '',
        nextCp = void 0;

    do {
        digits += toChar(this._consume());
        nextCp = this._lookahead();
    } while (nextCp !== $.EOF && isDigit(nextCp, isHex));

    if (this._lookahead() === $.SEMICOLON)
        this._consume();

    var referencedCp = parseInt(digits, isHex ? 16 : 10),
        replacement = NUMERIC_ENTITY_REPLACEMENTS[referencedCp];

    if (replacement)
        return replacement;

    //NOTE: surrogates and values above 0x10FFFF are replaced with the replacement character.
    if (isReservedCodePoint(referencedCp))
        return $.REPLACEMENT_CHARACTER;

    return referencedCp;
};

//Walks NAMED_ENTITY_TRIE starting from startCp, looking for the longest named entity match.
//Returns the code points stored for the matched entity, or null if nothing matched or if the match has to be
//ignored inside an attribute value (see the note below). When null is returned, all code points consumed
//by the lookup, including startCp, are unconsumed.
Tokenizer.prototype._consumeNamedEntity = function (startCp, inAttr) {
    var referencedCodePoints = null,
        entityCodePointsCount = 0,
        cp = startCp,
        leaf = NAMED_ENTITY_TRIE[cp],
        consumedCount = 1,
        semicolonTerminated = false;

    for (; leaf && cp !== $.EOF; cp = this._consume(), consumedCount++, leaf = leaf.l && leaf.l[cp]) {
        if (leaf.c) {
            //NOTE: we have at least one named reference match. But we don't stop lookup at this point,
            //because longer matches still can be found (e.g. '&not' and '&notin;') except the case
            //when the found match is terminated by a semicolon.
            referencedCodePoints = leaf.c;
            entityCodePointsCount = consumedCount;

            if (cp === $.SEMICOLON) {
                semicolonTerminated = true;
                break;
            }
        }
    }

    if (referencedCodePoints) {
        if (!semicolonTerminated) {
            //NOTE: unconsume excess (e.g. 'it' in '&notit')
            this._unconsumeSeveral(consumedCount - entityCodePointsCount);

            //NOTE: If the character reference is being consumed as part of an attribute and the next character
            //is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII character, then, for historical
            //reasons, all the characters that were matched after the U+0026 AMPERSAND character (&) must be
            //unconsumed, and nothing is returned.
            //However, if this next character is in fact a U+003D EQUALS SIGN character (=), then this is a
            //parse error, because some legacy user agents will misinterpret the markup in those cases.
            //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references)
            if (inAttr) {
                var nextCp = this._lookahead();

                if (nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp)) {
                    this._unconsumeSeveral(entityCodePointsCount);
                    return null;
                }
            }
        }

        return referencedCodePoints;
    }

    this._unconsumeSeveral(consumedCount);

    return null;
};

//Tries to consume a character reference whose first code point after the ampersand is startCp (already consumed).
//Returns an array of referenced code points, or null if this is not a character reference; in the latter
//case everything consumed after the ampersand, including startCp, is unconsumed.
Tokenizer.prototype._consumeCharacterReference = function (startCp, inAttr) {
    if (this.disableEntitiesDecoding || isWhitespace(startCp) || startCp === $.GREATER_THAN_SIGN ||
        startCp === $.AMPERSAND || startCp === this.additionalAllowedCp || startCp === $.EOF) {
        //NOTE: not a character reference. No characters are consumed, and nothing is returned.
        this._unconsume();
        return null;
    }

    else if (startCp === $.NUMBER_SIGN) {
        //NOTE: we have a numeric entity candidate, now we should determine if it's hex or decimal
        var isHex = false,
            nextCp = this._lookahead();

        if (nextCp === $.LATIN_SMALL_X || nextCp === $.LATIN_CAPITAL_X) {
            this._consume();
            isHex = true;
        }

        nextCp = this._lookahead();

        //NOTE: if we have at least one digit this is a numeric entity for sure, so we consume it
        if (nextCp !== $.EOF && isDigit(nextCp, isHex))
            return [this._consumeNumericEntity(isHex)];

        else {
            //NOTE: otherwise this is a bogus number entity and a parse error. Unconsume the number sign
            //and the 'x'-character if appropriate.
            this._unconsumeSeveral(isHex ? 2 : 1);
            return null;
        }
    }

    else
        return this._consumeNamedEntity(startCp, inAttr);
};

//State machine
//NOTE: every state handler is stored on the prototype under its state name and is called with the
//code point that has just been consumed.
var _ = Tokenizer.prototype;

//12.2.4.1 Data state
//------------------------------------------------------------------
_[DATA_STATE] = function dataState(cp) {
    if (cp === $.AMPERSAND)
        this.state = CHARACTER_REFERENCE_IN_DATA_STATE;

    else if (cp === $.LESS_THAN_SIGN)
        this.state = TAG_OPEN_STATE;

    else if (cp === $.NULL)
        this._emitCodePoint(cp);

    else if (cp === $.EOF)
        this._emitEOFToken();

    else
        this._emitCodePoint(cp);
};


//12.2.4.2 Character reference in data state
//------------------------------------------------------------------
_[CHARACTER_REFERENCE_IN_DATA_STATE] = function characterReferenceInDataState(cp) {
    this.state = DATA_STATE;
    this.additionalAllowedCp = void 0;

    var referencedCodePoints = this._consumeCharacterReference(cp, false);

    if (referencedCodePoints)
        this._emitSeveralCodePoints(referencedCodePoints);
    else
        this._emitChar('&');
};


//12.2.4.3 RCDATA state
//------------------------------------------------------------------
_[RCDATA_STATE] = function rcdataState(cp) {
    if (cp === $.AMPERSAND)
        this.state = CHARACTER_REFERENCE_IN_RCDATA_STATE;

    else if (cp === $.LESS_THAN_SIGN)
        this.state = RCDATA_LESS_THAN_SIGN_STATE;

    else if (cp === $.NULL)
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);

    else if (cp === $.EOF)
        this._emitEOFToken();

    else
        this._emitCodePoint(cp);
};


//12.2.4.4 Character reference in RCDATA state
//------------------------------------------------------------------
_[CHARACTER_REFERENCE_IN_RCDATA_STATE] = function characterReferenceInRcdataState(cp) {
    this.state = RCDATA_STATE;
    this.additionalAllowedCp = void 0;

    var referencedCodePoints = this._consumeCharacterReference(cp, false);

    if (referencedCodePoints)
        this._emitSeveralCodePoints(referencedCodePoints);
    else
        this._emitChar('&');
};


//12.2.4.5 RAWTEXT state
//------------------------------------------------------------------
_[RAWTEXT_STATE] = function rawtextState(cp) {
    if (cp === $.LESS_THAN_SIGN)
        this.state = RAWTEXT_LESS_THAN_SIGN_STATE;

    else if (cp === $.NULL)
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);

    else if (cp === $.EOF)
        this._emitEOFToken();

    else
        this._emitCodePoint(cp);
};


//12.2.4.6 Script data state
//------------------------------------------------------------------
_[SCRIPT_DATA_STATE] = function scriptDataState(cp) {
    if (cp === $.LESS_THAN_SIGN)
        this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;

    else if (cp === $.NULL)
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);

    else if (cp === $.EOF)
        this._emitEOFToken();

    else
        this._emitCodePoint(cp);
};


//12.2.4.7 PLAINTEXT state
//------------------------------------------------------------------
_[PLAINTEXT_STATE] = function plaintextState(cp) {
    if (cp === $.NULL)
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);

    else if (cp === $.EOF)
        this._emitEOFToken();

    else
        this._emitCodePoint(cp);
};


//12.2.4.8 Tag open state
//------------------------------------------------------------------
_[TAG_OPEN_STATE] = function tagOpenState(cp) {
    if (cp === $.EXCLAMATION_MARK)
        this.state = MARKUP_DECLARATION_OPEN_STATE;

    else if (cp === $.SOLIDUS)
        this.state = END_TAG_OPEN_STATE;

    else if (isAsciiUpper(cp)) {
        this._createStartTagToken(toAsciiLowerChar(cp));
        this.state = TAG_NAME_STATE;
    }

    else if (isAsciiLower(cp)) {
        this._createStartTagToken(toChar(cp));
        this.state = TAG_NAME_STATE;
    }

    else if (cp === $.QUESTION_MARK) {
        //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
        this[BOGUS_COMMENT_STATE](cp);
    }

    else {
        this._emitChar('<');
        this._reconsumeInState(DATA_STATE);
    }
};


//12.2.4.9 End tag open state
//------------------------------------------------------------------
_[END_TAG_OPEN_STATE] = function endTagOpenState(cp) {
    if (isAsciiUpper(cp)) {
        this._createEndTagToken(toAsciiLowerChar(cp));
        this.state = TAG_NAME_STATE;
    }

    else if (isAsciiLower(cp)) {
        this._createEndTagToken(toChar(cp));
        this.state = TAG_NAME_STATE;
    }

    else if (cp === $.GREATER_THAN_SIGN)
        this.state = DATA_STATE;

    else if (cp === $.EOF) {
        this._reconsumeInState(DATA_STATE);
        this._emitChar('<');
        this._emitChar('/');
    }

    else {
        //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
        this[BOGUS_COMMENT_STATE](cp);
    }
};


//12.2.4.10 Tag name state
//------------------------------------------------------------------
_[TAG_NAME_STATE] = function tagNameState(cp) {
    if (isWhitespace(cp))
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;

    else if (cp === $.SOLIDUS)
        this.state = SELF_CLOSING_START_TAG_STATE;

    else if (cp === $.GREATER_THAN_SIGN) {
        this.state = DATA_STATE;
        this._emitCurrentToken();
    }

    else if (isAsciiUpper(cp))
        this.currentToken.tagName += toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        this.currentToken.tagName += UNICODE.REPLACEMENT_CHARACTER;

    else if (cp === $.EOF)
        this._reconsumeInState(DATA_STATE);

    else
        this.currentToken.tagName += toChar(cp);
};


//12.2.4.11 RCDATA less-than sign state
//------------------------------------------------------------------
_[RCDATA_LESS_THAN_SIGN_STATE] = function rcdataLessThanSignState(cp) {
    if (cp === $.SOLIDUS) {
        this.tempBuff = [];
        this.state = RCDATA_END_TAG_OPEN_STATE;
    }

    else {
        this._emitChar('<');
        this._reconsumeInState(RCDATA_STATE);
    }
};


//12.2.4.12 RCDATA end tag open state
//------------------------------------------------------------------
_[RCDATA_END_TAG_OPEN_STATE] = function rcdataEndTagOpenState(cp) {
    //NOTE: the temporary buffer keeps the code points as they appear in the input (original case),
    //so that they can be emitted as text if this turns out not to be an appropriate end tag.
    if (isAsciiUpper(cp)) {
        this._createEndTagToken(toAsciiLowerChar(cp));
        this.tempBuff.push(cp);
        this.state = RCDATA_END_TAG_NAME_STATE;
    }

    else if (isAsciiLower(cp)) {
        this._createEndTagToken(toChar(cp));
        this.tempBuff.push(cp);
        this.state = RCDATA_END_TAG_NAME_STATE;
    }

    else {
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(RCDATA_STATE);
    }
};


//12.2.4.13 RCDATA end tag name state
//------------------------------------------------------------------
_[RCDATA_END_TAG_NAME_STATE] = function rcdataEndTagNameState(cp) {
    if (isAsciiUpper(cp)) {
        this.currentToken.tagName += toAsciiLowerChar(cp);
        this.tempBuff.push(cp);
    }

    else if (isAsciiLower(cp)) {
        this.currentToken.tagName += toChar(cp);
        this.tempBuff.push(cp);
    }

    else {
        if (this._isAppropriateEndTagToken()) {
            if (isWhitespace(cp)) {
                this.state = BEFORE_ATTRIBUTE_NAME_STATE;
                return;
            }

            if (cp === $.SOLIDUS) {
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;
            }

            if (cp === $.GREATER_THAN_SIGN) {
                this.state = DATA_STATE;
                this._emitCurrentToken();
                return;
            }
        }

        //NOTE: not an appropriate end tag - emit everything seen so far as text and reconsume the current code point.
        this._emitChar('<');
        this._emitChar('/');
        this._emitSeveralCodePoints(this.tempBuff);
        this._reconsumeInState(RCDATA_STATE);
    }
};


//12.2.4.14 RAWTEXT less-than sign state
//------------------------------------------------------------------
_[RAWTEXT_LESS_THAN_SIGN_STATE] = function rawtextLessThanSignState(cp) {
    if (cp === $.SOLIDUS) {
        this.tempBuff = [];
        this.state = RAWTEXT_END_TAG_OPEN_STATE;
    }

    else {
        this._emitChar('<');
        this._reconsumeInState(RAWTEXT_STATE);
    }
};


//12.2.4.15 RAWTEXT end tag open state
//------------------------------------------------------------------
_[RAWTEXT_END_TAG_OPEN_STATE] = function rawtextEndTagOpenState(cp) {
    if (isAsciiUpper(cp)) {
        this._createEndTagToken(toAsciiLowerChar(cp));
        this.tempBuff.push(cp);
        this.state = RAWTEXT_END_TAG_NAME_STATE;
    }

    else if (isAsciiLower(cp)) {
        this._createEndTagToken(toChar(cp));
        this.tempBuff.push(cp);
        this.state = RAWTEXT_END_TAG_NAME_STATE;
    }

    else {
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(RAWTEXT_STATE);
    }
};


//12.2.4.16 RAWTEXT end tag name state
//------------------------------------------------------------------
_[RAWTEXT_END_TAG_NAME_STATE] = function rawtextEndTagNameState(cp) {
    if (isAsciiUpper(cp)) {
        this.currentToken.tagName += toAsciiLowerChar(cp);
        this.tempBuff.push(cp);
    }

    else if (isAsciiLower(cp)) {
        this.currentToken.tagName += toChar(cp);
        this.tempBuff.push(cp);
    }

    else {
        if (this._isAppropriateEndTagToken()) {
            if (isWhitespace(cp)) {
                this.state = BEFORE_ATTRIBUTE_NAME_STATE;
                return;
            }

            if (cp === $.SOLIDUS) {
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;
            }

            if (cp === $.GREATER_THAN_SIGN) {
                this._emitCurrentToken();
                this.state = DATA_STATE;
                return;
            }
        }

        //NOTE: not an appropriate end tag - emit everything seen so far as text and reconsume the current code point.
        this._emitChar('<');
        this._emitChar('/');
        this._emitSeveralCodePoints(this.tempBuff);
        this._reconsumeInState(RAWTEXT_STATE);
    }
};


//12.2.4.17 Script data less-than sign state
//------------------------------------------------------------------
_[SCRIPT_DATA_LESS_THAN_SIGN_STATE] = function scriptDataLessThanSignState(cp) {
    if (cp === $.SOLIDUS) {
        this.tempBuff = [];
        this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
    }

    else if (cp === $.EXCLAMATION_MARK) {
        this.state = SCRIPT_DATA_ESCAPE_START_STATE;
        this._emitChar('<');
        this._emitChar('!');
    }

    else {
        this._emitChar('<');
        this._reconsumeInState(SCRIPT_DATA_STATE);
    }
};


//12.2.4.18 Script data end tag open state
//------------------------------------------------------------------
_[SCRIPT_DATA_END_TAG_OPEN_STATE] = function scriptDataEndTagOpenState(cp) {
    if (isAsciiUpper(cp)) {
        this._createEndTagToken(toAsciiLowerChar(cp));
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_END_TAG_NAME_STATE;
    }

    else if (isAsciiLower(cp)) {
        this._createEndTagToken(toChar(cp));
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_END_TAG_NAME_STATE;
    }

    else {
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(SCRIPT_DATA_STATE);
    }
};


//12.2.4.19 Script data end tag name state
//------------------------------------------------------------------
_[SCRIPT_DATA_END_TAG_NAME_STATE] = function scriptDataEndTagNameState(cp) {
    if (isAsciiUpper(cp)) {
        this.currentToken.tagName += toAsciiLowerChar(cp);
        this.tempBuff.push(cp);
    }

    else if (isAsciiLower(cp)) {
        this.currentToken.tagName += toChar(cp);
        this.tempBuff.push(cp);
    }

    else {
        if (this._isAppropriateEndTagToken()) {
            if (isWhitespace(cp)) {
                this.state = BEFORE_ATTRIBUTE_NAME_STATE;
                return;
            }

            else if (cp === $.SOLIDUS) {
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;
            }

            else if (cp === $.GREATER_THAN_SIGN) {
                this._emitCurrentToken();
                this.state = DATA_STATE;
                return;
            }
        }

        //NOTE: not an appropriate end tag - emit everything seen so far as text and reconsume the current code point.
        this._emitChar('<');
        this._emitChar('/');
        this._emitSeveralCodePoints(this.tempBuff);
        this._reconsumeInState(SCRIPT_DATA_STATE);
    }
};


//12.2.4.20 Script data escape start state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPE_START_STATE] = function scriptDataEscapeStartState(cp) {
    if (cp === $.HYPHEN_MINUS) {
        this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
        this._emitChar('-');
    }

    else
        this._reconsumeInState(SCRIPT_DATA_STATE);
};


//12.2.4.21 Script data escape start dash state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPE_START_DASH_STATE] = function scriptDataEscapeStartDashState(cp) {
    if (cp === $.HYPHEN_MINUS) {
        this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
        this._emitChar('-');
    }

    else
        this._reconsumeInState(SCRIPT_DATA_STATE);
};


//12.2.4.22 Script data escaped state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPED_STATE] = function scriptDataEscapedState(cp) {
    if (cp === $.HYPHEN_MINUS) {
        this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
        this._emitChar('-');
    }

    else if (cp === $.LESS_THAN_SIGN)
        this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

    else if (cp === $.NULL)
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);

    //NOTE: EOF is not handled here; it is reconsumed in the data state.
    else if (cp === $.EOF)
        this._reconsumeInState(DATA_STATE);

    else
        this._emitCodePoint(cp);
};


//12.2.4.23 Script data escaped dash state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPED_DASH_STATE] = function scriptDataEscapedDashState(cp) {
    if (cp === $.HYPHEN_MINUS) {
        this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
        this._emitChar('-');
    }

    else if (cp === $.LESS_THAN_SIGN)
        this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

    else if (cp === $.NULL) {
        this.state = SCRIPT_DATA_ESCAPED_STATE;
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
    }

    else if (cp === $.EOF)
        this._reconsumeInState(DATA_STATE);

    else {
        this.state = SCRIPT_DATA_ESCAPED_STATE;
        this._emitCodePoint(cp);
    }
};


//12.2.4.24 Script data escaped dash dash state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE] = function scriptDataEscapedDashDashState(cp) {
    if (cp === $.HYPHEN_MINUS)
        this._emitChar('-');

    else if (cp === $.LESS_THAN_SIGN)
        this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

    else if (cp === $.GREATER_THAN_SIGN) {
        this.state = SCRIPT_DATA_STATE;
        this._emitChar('>');
    }

    else if (cp === $.NULL) {
        this.state = SCRIPT_DATA_ESCAPED_STATE;
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
    }

    else if (cp === $.EOF)
        this._reconsumeInState(DATA_STATE);

    else {
        this.state = SCRIPT_DATA_ESCAPED_STATE;
        this._emitCodePoint(cp);
    }
};


//12.2.4.25 Script data escaped less-than sign state
//------------------------------------------------------------------
_[SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataEscapedLessThanSignState(cp) {
    if (cp === $.SOLIDUS) {
        this.tempBuff = [];
        this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
    }

    else if (isAsciiUpper(cp)) {
        //NOTE: the temporary buffer gets the lower-cased code point, while the emitted character keeps its case.
        this.tempBuff = [];
        this.tempBuff.push(toAsciiLowerCodePoint(cp));
        this.state = SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
        this._emitChar('<');
        this._emitCodePoint(cp);
    }

    else if (isAsciiLower(cp)) {
        this.tempBuff = [];
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
        this._emitChar('<');
        this._emitCodePoint(cp);
    }

    else {
        this._emitChar('<');
        this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
    }
};


//12.2.4.26 Script data escaped end tag open state
//------------------------------------------------------------------
//An ASCII letter starts an end tag token (named with the lowercase char) and is stored
//as written in the temporary buffer. Anything else emits "</" and is reconsumed
//in the escaped state.
_[SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE] = function scriptDataEscapedEndTagOpenState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        this._createEndTagToken(isUpper ? toAsciiLowerChar(cp) : toChar(cp));
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;
        return;
    }

    this._emitChar('<');
    this._emitChar('/');
    this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
};


//12.2.4.27 Script data escaped end tag name state
//------------------------------------------------------------------
//ASCII letters extend the tag name (lowercased) and the temporary buffer (as written).
//If the token is an appropriate end tag, whitespace, '/' and '>' finish the name.
//In every other case "</" plus the buffered code points are emitted as characters and
//the current code point is reconsumed in the escaped state.
_[SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE] = function scriptDataEscapedEndTagNameState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        this.currentToken.tagName += isUpper ? toAsciiLowerChar(cp) : toChar(cp);
        this.tempBuff.push(cp);
        return;
    }

    if (this._isAppropriateEndTagToken()) {
        if (isWhitespace(cp)) {
            this.state = BEFORE_ATTRIBUTE_NAME_STATE;
            return;
        }

        if (cp === $.SOLIDUS) {
            this.state = SELF_CLOSING_START_TAG_STATE;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this._emitCurrentToken();
            this.state = DATA_STATE;
            return;
        }
    }

    this._emitChar('<');
    this._emitChar('/');
    this._emitSeveralCodePoints(this.tempBuff);
    this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
};


//12.2.4.28 Script data double escape start state
//------------------------------------------------------------------
//Whitespace, '/' or '>' ends the buffered name: the double escaped state is entered when the
//temporary buffer equals the script string, the escaped state otherwise. ASCII letters are
//buffered in lowercase and emitted as written; anything else is reconsumed in the escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE] = function scriptDataDoubleEscapeStartState(cp) {
    if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
        if (this.isTempBufferEqualToScriptString())
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        else
            this.state = SCRIPT_DATA_ESCAPED_STATE;

        this._emitCodePoint(cp);
        return;
    }

    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        this.tempBuff.push(isUpper ? toAsciiLowerCodePoint(cp) : cp);
        this._emitCodePoint(cp);
        return;
    }

    this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
};


//12.2.4.29 Script data double escaped state
//------------------------------------------------------------------
//Emits every code point; '-' and '<' additionally switch to their dedicated states,
//NULL is emitted as the replacement character, EOF is reconsumed in the data state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_STATE] = function scriptDataDoubleEscapedState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.NULL:
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._emitCodePoint(cp);
    }
};


//12.2.4.30 Script data double escaped dash state
//------------------------------------------------------------------
//Like the double escaped state, but a further '-' goes to the dash dash state and
//NULL / ordinary code points return to the double escaped state after being emitted.
_[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE] = function scriptDataDoubleEscapedDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};


//12.2.4.31 Script data double escaped dash dash state
//------------------------------------------------------------------
//Stays here while '-' keeps coming (each one is emitted). '>' is emitted and returns to the
//script data state; NULL / ordinary code points return to the double escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE] = function scriptDataDoubleEscapedDashDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.GREATER_THAN_SIGN:
            this.state = SCRIPT_DATA_STATE;
            this._emitChar('>');
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};


//12.2.4.32 Script data double escaped less-than sign state
//------------------------------------------------------------------
//'/' resets the temporary buffer, is emitted and moves to the double escape end state;
//anything else is reconsumed in the double escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataDoubleEscapedLessThanSignState(cp) {
    if (cp !== $.SOLIDUS) {
        this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
        return;
    }

    this.tempBuff = [];
    this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
    this._emitChar('/');
};


//12.2.4.33 Script data double escape end state
//------------------------------------------------------------------
//Mirror of the double escape start state: when the buffered name equals the script string
//the tokenizer drops back to the escaped state, otherwise it stays double escaped.
_[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE] = function scriptDataDoubleEscapeEndState(cp) {
    if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
        if (this.isTempBufferEqualToScriptString())
            this.state = SCRIPT_DATA_ESCAPED_STATE;
        else
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;

        this._emitCodePoint(cp);
        return;
    }

    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        this.tempBuff.push(isUpper ? toAsciiLowerCodePoint(cp) : cp);
        this._emitCodePoint(cp);
        return;
    }

    this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
};


//12.2.4.34 Before attribute name state
//------------------------------------------------------------------
//Skips whitespace. '/' and '>' end the attribute list, EOF is reconsumed in the data state.
//Every other code point starts a new attribute: uppercase ASCII is lowercased, NULL becomes the
//replacement character and the rest (including quotes, '<' and '=') is used as is.
_[BEFORE_ATTRIBUTE_NAME_STATE] = function beforeAttributeNameState(cp) {
    if (isWhitespace(cp))
        return;

    if (cp === $.SOLIDUS) {
        this.state = SELF_CLOSING_START_TAG_STATE;
        return;
    }

    if (cp === $.GREATER_THAN_SIGN) {
        this.state = DATA_STATE;
        this._emitCurrentToken();
        return;
    }

    var firstChar;

    if (isAsciiUpper(cp))
        firstChar = toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        firstChar = UNICODE.REPLACEMENT_CHARACTER;

    else if (cp === $.EOF) {
        this._reconsumeInState(DATA_STATE);
        return;
    }

    else
        firstChar = toChar(cp);

    this._createAttr(firstChar);
    this.state = ATTRIBUTE_NAME_STATE;
};


//12.2.4.35 Attribute name state
//------------------------------------------------------------------
//Whitespace, '/', '=' and '>' finish the name through _leaveAttrName ('>' also emits the token).
//Otherwise the code point is appended to the name: uppercase ASCII lowercased, NULL as the
//replacement character, everything else (including quotes and '<') as is.
//EOF is reconsumed in the data state.
_[ATTRIBUTE_NAME_STATE] = function attributeNameState(cp) {
    if (isWhitespace(cp)) {
        this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
        return;
    }

    if (cp === $.SOLIDUS) {
        this._leaveAttrName(SELF_CLOSING_START_TAG_STATE);
        return;
    }

    if (cp === $.EQUALS_SIGN) {
        this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
        return;
    }

    if (cp === $.GREATER_THAN_SIGN) {
        this._leaveAttrName(DATA_STATE);
        this._emitCurrentToken();
        return;
    }

    if (isAsciiUpper(cp))
        this.currentAttr.name += toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        this.currentAttr.name += UNICODE.REPLACEMENT_CHARACTER;

    else if (cp === $.EOF)
        this._reconsumeInState(DATA_STATE);

    else
        this.currentAttr.name += toChar(cp);
};


//12.2.4.36 After attribute name state
//------------------------------------------------------------------
//Skips whitespace. '/' , '=' and '>' move on without creating an attribute, EOF is reconsumed
//in the data state. Every other code point starts a new attribute exactly as in the
//before attribute name state (quotes and '<' are used as is).
_[AFTER_ATTRIBUTE_NAME_STATE] = function afterAttributeNameState(cp) {
    if (isWhitespace(cp))
        return;

    if (cp === $.SOLIDUS) {
        this.state = SELF_CLOSING_START_TAG_STATE;
        return;
    }

    if (cp === $.EQUALS_SIGN) {
        this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
        return;
    }

    if (cp === $.GREATER_THAN_SIGN) {
        this.state = DATA_STATE;
        this._emitCurrentToken();
        return;
    }

    var firstChar;

    if (isAsciiUpper(cp))
        firstChar = toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        firstChar = UNICODE.REPLACEMENT_CHARACTER;

    else if (cp === $.EOF) {
        this._reconsumeInState(DATA_STATE);
        return;
    }

    else
        firstChar = toChar(cp);

    this._createAttr(firstChar);
    this.state = ATTRIBUTE_NAME_STATE;
};


//12.2.4.37 Before attribute value state
//------------------------------------------------------------------
//Skips whitespace, then picks the value flavour: '"' and '\'' open a quoted value, '&' is
//reconsumed in the unquoted state, '>' emits the token, EOF is reconsumed in the data state.
//Any other code point (including '<', '=' and '`') becomes the first char of an unquoted value,
//with NULL stored as the replacement character.
_[BEFORE_ATTRIBUTE_VALUE_STATE] = function beforeAttributeValueState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
            break;

        case $.APOSTROPHE:
            this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            this.state = ATTRIBUTE_VALUE_UNQUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
            this.state = ATTRIBUTE_VALUE_UNQUOTED_STATE;
    }
};


//12.2.4.38 Attribute value (double-quoted) state
//------------------------------------------------------------------
//Collects the value up to the closing '"'. '&' hands over to the character reference state,
//remembering the current state as the return state and '"' as the additional allowed code point.
_[ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE] = function attributeValueDoubleQuotedState(cp) {
    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this.additionalAllowedCp = $.QUOTATION_MARK;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
    }
};


//12.2.4.39 Attribute value (single-quoted) state
//------------------------------------------------------------------
//Collects the value up to the closing '\''. '&' hands over to the character reference state,
//remembering the current state as the return state and '\'' as the additional allowed code point.
_[ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE] = function attributeValueSingleQuotedState(cp) {
    switch (cp) {
        case $.APOSTROPHE:
            this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this.additionalAllowedCp = $.APOSTROPHE;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
    }
};


//12.2.4.40 Attribute value (unquoted) state
//------------------------------------------------------------------
//Whitespace ends the value, '>' ends the tag and emits it, '&' hands over to the character
//reference state with '>' as the additional allowed code point. Everything else is appended
//to the value (quotes, '<', '=' and '`' as is, NULL as the replacement character).
_[ATTRIBUTE_VALUE_UNQUOTED_STATE] = function attributeValueUnquotedState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.AMPERSAND:
            this.additionalAllowedCp = $.GREATER_THAN_SIGN;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
    }
};


//12.2.4.41 Character reference in attribute value state
//------------------------------------------------------------------
//Appends the referenced chars to the attribute value, or a literal '&' when no reference
//was consumed, then resumes the state stored in returnState.
_[CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE] = function characterReferenceInAttributeValueState(cp) {
    var codePoints = this._consumeCharacterReference(cp, true);

    if (!codePoints)
        this.currentAttr.value += '&';

    else {
        for (var idx = 0; idx < codePoints.length; idx++)
            this.currentAttr.value += toChar(codePoints[idx]);
    }

    this.state = this.returnState;
};


//12.2.4.42 After attribute value (quoted) state
//------------------------------------------------------------------
//Whitespace, '/' and '>' continue the tag normally; EOF is reconsumed in the data state and
//any other code point is reconsumed in the before attribute name state.
_[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE] = function afterAttributeValueQuotedState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.SOLIDUS:
            this.state = SELF_CLOSING_START_TAG_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
    }
};


//12.2.4.43 Self-closing start tag state
//------------------------------------------------------------------
//'>' marks the current token as self-closing and emits it; EOF is reconsumed in the data state,
//anything else in the before attribute name state.
_[SELF_CLOSING_START_TAG_STATE] = function selfClosingStartTagState(cp) {
    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this.currentToken.selfClosing = true;
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
    }
};


//12.2.4.44 Bogus comment state
//------------------------------------------------------------------
//Creates a comment token and fills it in one go: code points are consumed up to '>' or EOF
//(NULL is stored as the replacement character), then the token is emitted.
_[BOGUS_COMMENT_STATE] = function bogusCommentState(cp) {
    this._createCommentToken();

    while (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) {
        this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
        cp = this._consume();
    }

    if (cp === $.GREATER_THAN_SIGN)
        this.state = DATA_STATE;
    else
        this._reconsumeInState(DATA_STATE);

    this._emitCurrentToken();
};


//12.2.4.45 Markup declaration open state
//------------------------------------------------------------------
//Tries, in order, the comment opener, the DOCTYPE keyword and (only when allowCDATA is set)
//the CDATA opener. When none of them matches the input is handled as a bogus comment.
_[MARKUP_DECLARATION_OPEN_STATE] = function markupDeclarationOpenState(cp) {
    if (this._consumeSubsequentIfMatch($$.DASH_DASH_STRING, cp, true)) {
        this._createCommentToken();
        this.state = COMMENT_START_STATE;
        return;
    }

    if (this._consumeSubsequentIfMatch($$.DOCTYPE_STRING, cp, false)) {
        this.state = DOCTYPE_STATE;
        return;
    }

    if (this.allowCDATA && this._consumeSubsequentIfMatch($$.CDATA_START_STRING, cp, true)) {
        this.state = CDATA_SECTION_STATE;
        return;
    }

    //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
    this[BOGUS_COMMENT_STATE](cp);
};


//12.2.4.46 Comment start state
//------------------------------------------------------------------
//'-' moves to the comment start dash state, '>' emits the (empty) comment, EOF emits the
//comment and is reconsumed in the data state. Any other code point becomes comment data.
_[COMMENT_START_STATE] = function commentStartState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_START_DASH_STATE;
            break;

        case $.NULL:
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            this.state = COMMENT_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += toChar(cp);
            this.state = COMMENT_STATE;
    }
};


//12.2.4.47 Comment start dash state
//------------------------------------------------------------------
//A second '-' moves to the comment end state. Otherwise the pending '-' is flushed into the
//comment data together with the current code point ('>' and EOF emit the comment instead).
_[COMMENT_START_DASH_STATE] = function commentStartDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_END_STATE;
            break;

        case $.NULL:
            this.currentToken.data += '-';
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            this.state = COMMENT_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '-';
            this.currentToken.data += toChar(cp);
            this.state = COMMENT_STATE;
    }
};


//12.2.4.48 Comment state
//------------------------------------------------------------------
//Accumulates comment data; '-' moves to the comment end dash state, EOF emits the comment
//and is reconsumed in the data state.
_[COMMENT_STATE] = function commentState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_END_DASH_STATE;
            break;

        case $.NULL:
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += toChar(cp);
    }
};


//12.2.4.49 Comment end dash state
//------------------------------------------------------------------
//A second '-' moves to the comment end state. Otherwise the pending '-' and the current
//code point are appended to the data and the tokenizer returns to the comment state.
_[COMMENT_END_DASH_STATE] = function commentEndDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_END_STATE;
            break;

        case $.NULL:
            this.currentToken.data += '-';
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            this.state = COMMENT_STATE;
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '-';
            this.currentToken.data += toChar(cp);
            this.state = COMMENT_STATE;
    }
};


//12.2.4.50 Comment end state
//------------------------------------------------------------------
//'>' emits the comment, '!' moves to the comment end bang state, an extra '-' is kept as data.
//Any other code point flushes the pending "--" into the data and returns to the comment state.
_[COMMENT_END_STATE] = function commentEndState(cp) {
    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EXCLAMATION_MARK:
            this.state = COMMENT_END_BANG_STATE;
            break;

        case $.HYPHEN_MINUS:
            this.currentToken.data += '-';
            break;

        case $.NULL:
            this.currentToken.data += '--';
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            this.state = COMMENT_STATE;
            break;

        case $.EOF:
            //NOTE: the comment is emitted first and EOF is reconsumed afterwards,
            //the same order every other comment state uses on EOF.
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '--';
            this.currentToken.data += toChar(cp);
            this.state = COMMENT_STATE;
    }
};


//12.2.4.51 Comment end bang state
//------------------------------------------------------------------
//'>' emits the comment. Otherwise the pending "--!" is flushed into the data: '-' then continues
//in the comment end dash state, other code points are appended and return to the comment state.
_[COMMENT_END_BANG_STATE] = function commentEndBangState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.currentToken.data += '--!';
            this.state = COMMENT_END_DASH_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.NULL:
            this.currentToken.data += '--!';
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            this.state = COMMENT_STATE;
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '--!';
            this.currentToken.data += toChar(cp);
            this.state = COMMENT_STATE;
    }
};


//12.2.4.52 DOCTYPE state
//------------------------------------------------------------------
//Whitespace moves to the before DOCTYPE name state and any other code point is reconsumed there.
//EOF creates a DOCTYPE token with forceQuirks set, emits it and is reconsumed in the data state.
_[DOCTYPE_STATE] = function doctypeState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_NAME_STATE;
        return;
    }

    if (cp !== $.EOF) {
        this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
        return;
    }

    this._createDoctypeToken();
    this.currentToken.forceQuirks = true;
    this._emitCurrentToken();
    this._reconsumeInState(DATA_STATE);
};


//12.2.4.53 Before DOCTYPE name state
//------------------------------------------------------------------
//Skips whitespace. The first significant code point creates the DOCTYPE token: uppercase ASCII
//is lowercased, NULL becomes the replacement character. '>' and EOF create a nameless token
//with forceQuirks set and emit it right away.
_[BEFORE_DOCTYPE_NAME_STATE] = function beforeDoctypeNameState(cp) {
    if (isWhitespace(cp))
        return;

    if (isAsciiUpper(cp)) {
        this._createDoctypeToken(toAsciiLowerChar(cp));
        this.state = DOCTYPE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._createDoctypeToken();
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this._createDoctypeToken();
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        case $.NULL:
            this._createDoctypeToken(UNICODE.REPLACEMENT_CHARACTER);
            this.state = DOCTYPE_NAME_STATE;
            break;

        default:
            this._createDoctypeToken(toChar(cp));
            this.state = DOCTYPE_NAME_STATE;
    }
};


//12.2.4.54 DOCTYPE name state
//------------------------------------------------------------------
//Appends to the DOCTYPE name (uppercase ASCII lowercased, NULL as the replacement character)
//until whitespace or '>'. EOF sets forceQuirks, emits the token and is reconsumed in the data state.
_[DOCTYPE_NAME_STATE] = function doctypeNameState(cp) {
    if (isWhitespace(cp)) {
        this.state = AFTER_DOCTYPE_NAME_STATE;
        return;
    }

    if (cp === $.GREATER_THAN_SIGN) {
        this._emitCurrentToken();
        this.state = DATA_STATE;
        return;
    }

    if (isAsciiUpper(cp)) {
        this.currentToken.name += toAsciiLowerChar(cp);
        return;
    }

    switch (cp) {
        case $.NULL:
            this.currentToken.name += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.name += toChar(cp);
    }
};


//12.2.4.55 After DOCTYPE name state
//------------------------------------------------------------------
//Skips whitespace. '>' emits the token; the PUBLIC and SYSTEM keywords are tried in that order;
//anything else sets forceQuirks and moves to the bogus DOCTYPE state.
_[AFTER_DOCTYPE_NAME_STATE] = function afterDoctypeNameState(cp) {
    if (isWhitespace(cp))
        return;

    if (cp === $.GREATER_THAN_SIGN) {
        this.state = DATA_STATE;
        this._emitCurrentToken();
        return;
    }

    if (cp === $.EOF) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
        return;
    }

    if (this._consumeSubsequentIfMatch($$.PUBLIC_STRING, cp, false)) {
        this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
        return;
    }

    if (this._consumeSubsequentIfMatch($$.SYSTEM_STRING, cp, false)) {
        this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
        return;
    }

    this.currentToken.forceQuirks = true;
    this.state = BOGUS_DOCTYPE_STATE;
};


//12.2.4.56 After DOCTYPE public keyword state
//------------------------------------------------------------------
//Whitespace moves to the before public identifier state; a quote starts an empty public
//identifier in the matching quoted state. '>' and EOF emit the token with forceQuirks set;
//anything else sets forceQuirks and moves to the bogus DOCTYPE state.
_[AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE] = function afterDoctypePublicKeywordState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
        return;
    }

    switch (cp) {
        case $.QUOTATION_MARK:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.57 Before DOCTYPE public identifier state
//------------------------------------------------------------------
//Skips whitespace; otherwise dispatches exactly like the after DOCTYPE public keyword state.
_[BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function beforeDoctypePublicIdentifierState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.QUOTATION_MARK:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.58 DOCTYPE public identifier (double-quoted) state
//------------------------------------------------------------------
//Collects the public identifier up to the closing '"' (NULL as the replacement character).
//'>' and EOF emit the token with forceQuirks set.
_[DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypePublicIdentifierDoubleQuotedState(cp) {
    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
            break;

        case $.NULL:
            this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.publicId += toChar(cp);
    }
};


//12.2.4.59 DOCTYPE public identifier (single-quoted) state
//------------------------------------------------------------------
//Collects the public identifier up to the closing '\'' (NULL as the replacement character).
//'>' and EOF emit the token with forceQuirks set.
_[DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypePublicIdentifierSingleQuotedState(cp) {
    switch (cp) {
        case $.APOSTROPHE:
            this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
            break;

        case $.NULL:
            this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.publicId += toChar(cp);
    }
};


//12.2.4.60 After DOCTYPE public identifier state
//------------------------------------------------------------------
//Whitespace moves to the between-identifiers state, '>' emits the token, a quote starts an
//empty system identifier. EOF emits with forceQuirks set; anything else sets forceQuirks
//and moves to the bogus DOCTYPE state.
_[AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function afterDoctypePublicIdentifierState(cp) {
    if (isWhitespace(cp)) {
        this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
        return;
    }

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.61 Between DOCTYPE public and system identifiers state
//------------------------------------------------------------------
//Skips whitespace; otherwise dispatches exactly like the after DOCTYPE public identifier state.
_[BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE] = function betweenDoctypePublicAndSystemIdentifiersState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.62 After DOCTYPE system keyword state
//------------------------------------------------------------------
//Whitespace moves to the before system identifier state; a quote starts an empty system
//identifier in the matching quoted state. '>' and EOF emit the token with forceQuirks set;
//anything else sets forceQuirks and moves to the bogus DOCTYPE state.
_[AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE] = function afterDoctypeSystemKeywordState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
        return;
    }

    switch (cp) {
        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.63 Before DOCTYPE system identifier state
//------------------------------------------------------------------
//Skips whitespace; otherwise dispatches exactly like the after DOCTYPE system keyword state.
_[BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function beforeDoctypeSystemIdentifierState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.64 DOCTYPE system identifier (double-quoted) state
//------------------------------------------------------------------
//Collects the system identifier up to the closing '"' (NULL as the replacement character).
//'>' and EOF emit the token with forceQuirks set.
_[DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypeSystemIdentifierDoubleQuotedState(cp) {
    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.NULL:
            this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.systemId += toChar(cp);
    }
};


//12.2.4.65 DOCTYPE system identifier (single-quoted) state
//------------------------------------------------------------------
//Collects the system identifier up to the closing '\'' (NULL as the replacement character).
//'>' and EOF emit the token with forceQuirks set.
_[DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypeSystemIdentifierSingleQuotedState(cp) {
    switch (cp) {
        case $.APOSTROPHE:
            this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.NULL:
            this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.systemId += toChar(cp);
    }
};


//12.2.4.66 After DOCTYPE system identifier state
//------------------------------------------------------------------
//Skips whitespace. '>' emits the token, EOF emits it with forceQuirks set.
//NOTE: any other code point moves to the bogus DOCTYPE state without touching forceQuirks.
_[AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function afterDoctypeSystemIdentifierState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = BOGUS_DOCTYPE_STATE;
    }
};


//12.2.4.67 Bogus DOCTYPE state
//------------------------------------------------------------------
//Ignores everything up to '>' or EOF; either one emits the token and leaves for the data state
//(EOF is reconsumed there).
_[BOGUS_DOCTYPE_STATE] = function bogusDoctypeState(cp) {
    if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF)
        return;

    this._emitCurrentToken();

    if (cp === $.GREATER_THAN_SIGN)
        this.state = DATA_STATE;
    else
        this._reconsumeInState(DATA_STATE);
};


//12.2.4.68 CDATA section state
//------------------------------------------------------------------
//Emits code points one by one, consuming the input in a loop, until either EOF
//(reconsumed in the data state) or the CDATA end sequence (switches to the data state).
_[CDATA_SECTION_STATE] = function cdataSectionState(cp) {
    for (;;) {
        if (cp === $.EOF) {
            this._reconsumeInState(DATA_STATE);
            return;
        }

        if (this._consumeSubsequentIfMatch($$.CDATA_END_STRING, cp, true)) {
            this.state = DATA_STATE;
            return;
        }

        this._emitCodePoint(cp);
        cp = this._consume();
    }
};