1'use strict'; 2 3const Preprocessor = require('./preprocessor'); 4const unicode = require('../common/unicode'); 5const neTree = require('./named-entity-data'); 6const ERR = require('../common/error-codes'); 7 8//Aliases 9const $ = unicode.CODE_POINTS; 10const $$ = unicode.CODE_POINT_SEQUENCES; 11 12//C1 Unicode control character reference replacements 13const C1_CONTROLS_REFERENCE_REPLACEMENTS = { 14 0x80: 0x20ac, 15 0x82: 0x201a, 16 0x83: 0x0192, 17 0x84: 0x201e, 18 0x85: 0x2026, 19 0x86: 0x2020, 20 0x87: 0x2021, 21 0x88: 0x02c6, 22 0x89: 0x2030, 23 0x8a: 0x0160, 24 0x8b: 0x2039, 25 0x8c: 0x0152, 26 0x8e: 0x017d, 27 0x91: 0x2018, 28 0x92: 0x2019, 29 0x93: 0x201c, 30 0x94: 0x201d, 31 0x95: 0x2022, 32 0x96: 0x2013, 33 0x97: 0x2014, 34 0x98: 0x02dc, 35 0x99: 0x2122, 36 0x9a: 0x0161, 37 0x9b: 0x203a, 38 0x9c: 0x0153, 39 0x9e: 0x017e, 40 0x9f: 0x0178 41}; 42 43// Named entity tree flags 44const HAS_DATA_FLAG = 1 << 0; 45const DATA_DUPLET_FLAG = 1 << 1; 46const HAS_BRANCHES_FLAG = 1 << 2; 47const MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG; 48 49//States 50const DATA_STATE = 'DATA_STATE'; 51const RCDATA_STATE = 'RCDATA_STATE'; 52const RAWTEXT_STATE = 'RAWTEXT_STATE'; 53const SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE'; 54const PLAINTEXT_STATE = 'PLAINTEXT_STATE'; 55const TAG_OPEN_STATE = 'TAG_OPEN_STATE'; 56const END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE'; 57const TAG_NAME_STATE = 'TAG_NAME_STATE'; 58const RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE'; 59const RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE'; 60const RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE'; 61const RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE'; 62const RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE'; 63const RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE'; 64const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE'; 65const SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE'; 66const SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE'; 67const SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE'; 68const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE'; 69const SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE'; 70const SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE'; 71const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE'; 72const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE'; 73const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE'; 74const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE'; 75const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE'; 76const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE'; 77const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE'; 78const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE'; 79const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE'; 80const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE'; 81const BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE'; 82const ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE'; 83const AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE'; 84const BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE'; 85const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE'; 86const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE'; 87const ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE'; 88const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE'; 89const SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE'; 90const BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE'; 91const MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE'; 92const COMMENT_START_STATE = 'COMMENT_START_STATE'; 93const COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE'; 94const COMMENT_STATE = 'COMMENT_STATE'; 95const COMMENT_LESS_THAN_SIGN_STATE = 'COMMENT_LESS_THAN_SIGN_STATE'; 96const COMMENT_LESS_THAN_SIGN_BANG_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_STATE'; 97const COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE'; 98const COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE'; 99const COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE'; 100const COMMENT_END_STATE = 'COMMENT_END_STATE'; 101const COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE'; 102const DOCTYPE_STATE = 'DOCTYPE_STATE'; 103const BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE'; 104const DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE'; 105const AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE'; 106const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE'; 107const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE'; 108const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE'; 109const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE'; 110const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE'; 111const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE'; 112const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE'; 113const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE'; 114const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE'; 115const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE'; 116const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE'; 117const BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE'; 118const CDATA_SECTION_STATE = 'CDATA_SECTION_STATE'; 119const CDATA_SECTION_BRACKET_STATE = 'CDATA_SECTION_BRACKET_STATE'; 120const CDATA_SECTION_END_STATE = 'CDATA_SECTION_END_STATE'; 121const CHARACTER_REFERENCE_STATE = 'CHARACTER_REFERENCE_STATE'; 122const NAMED_CHARACTER_REFERENCE_STATE = 'NAMED_CHARACTER_REFERENCE_STATE'; 123const AMBIGUOUS_AMPERSAND_STATE = 'AMBIGUOS_AMPERSAND_STATE'; 124const NUMERIC_CHARACTER_REFERENCE_STATE = 'NUMERIC_CHARACTER_REFERENCE_STATE'; 125const HEXADEMICAL_CHARACTER_REFERENCE_START_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_START_STATE'; 126const DECIMAL_CHARACTER_REFERENCE_START_STATE = 'DECIMAL_CHARACTER_REFERENCE_START_STATE'; 127const HEXADEMICAL_CHARACTER_REFERENCE_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_STATE'; 128const DECIMAL_CHARACTER_REFERENCE_STATE = 'DECIMAL_CHARACTER_REFERENCE_STATE'; 129const NUMERIC_CHARACTER_REFERENCE_END_STATE = 'NUMERIC_CHARACTER_REFERENCE_END_STATE'; 130 131//Utils 132 133//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline 134//this functions if they will be situated in another module due to context switch. 135//Always perform inlining check before modifying this functions ('node --trace-inlining'). 136function isWhitespace(cp) { 137 return cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED; 138} 139 140function isAsciiDigit(cp) { 141 return cp >= $.DIGIT_0 && cp <= $.DIGIT_9; 142} 143 144function isAsciiUpper(cp) { 145 return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_Z; 146} 147 148function isAsciiLower(cp) { 149 return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_Z; 150} 151 152function isAsciiLetter(cp) { 153 return isAsciiLower(cp) || isAsciiUpper(cp); 154} 155 156function isAsciiAlphaNumeric(cp) { 157 return isAsciiLetter(cp) || isAsciiDigit(cp); 158} 159 160function isAsciiUpperHexDigit(cp) { 161 return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F; 162} 163 164function isAsciiLowerHexDigit(cp) { 165 return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F; 166} 167 168function isAsciiHexDigit(cp) { 169 return isAsciiDigit(cp) || isAsciiUpperHexDigit(cp) || isAsciiLowerHexDigit(cp); 170} 171 172function toAsciiLowerCodePoint(cp) { 173 return cp + 0x0020; 174} 175 176//NOTE: String.fromCharCode() function can handle only characters from BMP subset. 177//So, we need to workaround this manually. 178//(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values) 179function toChar(cp) { 180 if (cp <= 0xffff) { 181 return String.fromCharCode(cp); 182 } 183 184 cp -= 0x10000; 185 return String.fromCharCode(((cp >>> 10) & 0x3ff) | 0xd800) + String.fromCharCode(0xdc00 | (cp & 0x3ff)); 186} 187 188function toAsciiLowerChar(cp) { 189 return String.fromCharCode(toAsciiLowerCodePoint(cp)); 190} 191 192function findNamedEntityTreeBranch(nodeIx, cp) { 193 const branchCount = neTree[++nodeIx]; 194 let lo = ++nodeIx; 195 let hi = lo + branchCount - 1; 196 197 while (lo <= hi) { 198 const mid = (lo + hi) >>> 1; 199 const midCp = neTree[mid]; 200 201 if (midCp < cp) { 202 lo = mid + 1; 203 } else if (midCp > cp) { 204 hi = mid - 1; 205 } else { 206 return neTree[mid + branchCount]; 207 } 208 } 209 210 return -1; 211} 212 213//Tokenizer 214class Tokenizer { 215 constructor() { 216 this.preprocessor = new Preprocessor(); 217 218 this.tokenQueue = []; 219 220 this.allowCDATA = false; 221 222 this.state = DATA_STATE; 223 this.returnState = ''; 224 225 this.charRefCode = -1; 226 this.tempBuff = []; 227 this.lastStartTagName = ''; 228 229 this.consumedAfterSnapshot = -1; 230 this.active = false; 231 232 this.currentCharacterToken = null; 233 this.currentToken = null; 234 this.currentAttr = null; 235 } 236 237 //Errors 238 _err() { 239 // NOTE: err reporting is noop by default. Enabled by mixin. 240 } 241 242 _errOnNextCodePoint(err) { 243 this._consume(); 244 this._err(err); 245 this._unconsume(); 246 } 247 248 //API 249 getNextToken() { 250 while (!this.tokenQueue.length && this.active) { 251 this.consumedAfterSnapshot = 0; 252 253 const cp = this._consume(); 254 255 if (!this._ensureHibernation()) { 256 this[this.state](cp); 257 } 258 } 259 260 return this.tokenQueue.shift(); 261 } 262 263 write(chunk, isLastChunk) { 264 this.active = true; 265 this.preprocessor.write(chunk, isLastChunk); 266 } 267 268 insertHtmlAtCurrentPos(chunk) { 269 this.active = true; 270 this.preprocessor.insertHtmlAtCurrentPos(chunk); 271 } 272 273 //Hibernation 274 _ensureHibernation() { 275 if (this.preprocessor.endOfChunkHit) { 276 for (; this.consumedAfterSnapshot > 0; this.consumedAfterSnapshot--) { 277 this.preprocessor.retreat(); 278 } 279 280 this.active = false; 281 this.tokenQueue.push({ type: Tokenizer.HIBERNATION_TOKEN }); 282 283 return true; 284 } 285 286 return false; 287 } 288 289 //Consumption 290 _consume() { 291 this.consumedAfterSnapshot++; 292 return this.preprocessor.advance(); 293 } 294 295 _unconsume() { 296 this.consumedAfterSnapshot--; 297 this.preprocessor.retreat(); 298 } 299 300 _reconsumeInState(state) { 301 this.state = state; 302 this._unconsume(); 303 } 304 305 _consumeSequenceIfMatch(pattern, startCp, caseSensitive) { 306 let consumedCount = 0; 307 let isMatch = true; 308 const patternLength = pattern.length; 309 let patternPos = 0; 310 let cp = startCp; 311 let patternCp = void 0; 312 313 for (; patternPos < patternLength; patternPos++) { 314 if (patternPos > 0) { 315 cp = this._consume(); 316 consumedCount++; 317 } 318 319 if (cp === $.EOF) { 320 isMatch = false; 321 break; 322 } 323 324 patternCp = pattern[patternPos]; 325 326 if (cp !== patternCp && (caseSensitive || cp !== toAsciiLowerCodePoint(patternCp))) { 327 isMatch = false; 328 break; 329 } 330 } 331 332 if (!isMatch) { 333 while (consumedCount--) { 334 this._unconsume(); 335 } 336 } 337 338 return isMatch; 339 } 340 341 //Temp buffer 342 _isTempBufferEqualToScriptString() { 343 if (this.tempBuff.length !== $$.SCRIPT_STRING.length) { 344 return false; 345 } 346 347 for (let i = 0; i < this.tempBuff.length; i++) { 348 if (this.tempBuff[i] !== $$.SCRIPT_STRING[i]) { 349 return false; 350 } 351 } 352 353 return true; 354 } 355 356 //Token creation 357 _createStartTagToken() { 358 this.currentToken = { 359 type: Tokenizer.START_TAG_TOKEN, 360 tagName: '', 361 selfClosing: false, 362 ackSelfClosing: false, 363 attrs: [] 364 }; 365 } 366 367 _createEndTagToken() { 368 this.currentToken = { 369 type: Tokenizer.END_TAG_TOKEN, 370 tagName: '', 371 selfClosing: false, 372 attrs: [] 373 }; 374 } 375 376 _createCommentToken() { 377 this.currentToken = { 378 type: Tokenizer.COMMENT_TOKEN, 379 data: '' 380 }; 381 } 382 383 _createDoctypeToken(initialName) { 384 this.currentToken = { 385 type: Tokenizer.DOCTYPE_TOKEN, 386 name: initialName, 387 forceQuirks: false, 388 publicId: null, 389 systemId: null 390 }; 391 } 392 393 _createCharacterToken(type, ch) { 394 this.currentCharacterToken = { 395 type: type, 396 chars: ch 397 }; 398 } 399 400 _createEOFToken() { 401 this.currentToken = { type: Tokenizer.EOF_TOKEN }; 402 } 403 404 //Tag attributes 405 _createAttr(attrNameFirstCh) { 406 this.currentAttr = { 407 name: attrNameFirstCh, 408 value: '' 409 }; 410 } 411 412 _leaveAttrName(toState) { 413 if (Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name) === null) { 414 this.currentToken.attrs.push(this.currentAttr); 415 } else { 416 this._err(ERR.duplicateAttribute); 417 } 418 419 this.state = toState; 420 } 421 422 _leaveAttrValue(toState) { 423 this.state = toState; 424 } 425 426 //Token emission 427 _emitCurrentToken() { 428 this._emitCurrentCharacterToken(); 429 430 const ct = this.currentToken; 431 432 this.currentToken = null; 433 434 //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate. 435 if (ct.type === Tokenizer.START_TAG_TOKEN) { 436 this.lastStartTagName = ct.tagName; 437 } else if (ct.type === Tokenizer.END_TAG_TOKEN) { 438 if (ct.attrs.length > 0) { 439 this._err(ERR.endTagWithAttributes); 440 } 441 442 if (ct.selfClosing) { 443 this._err(ERR.endTagWithTrailingSolidus); 444 } 445 } 446 447 this.tokenQueue.push(ct); 448 } 449 450 _emitCurrentCharacterToken() { 451 if (this.currentCharacterToken) { 452 this.tokenQueue.push(this.currentCharacterToken); 453 this.currentCharacterToken = null; 454 } 455 } 456 457 _emitEOFToken() { 458 this._createEOFToken(); 459 this._emitCurrentToken(); 460 } 461 462 //Characters emission 463 464 //OPTIMIZATION: specification uses only one type of character tokens (one token per character). 465 //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters. 466 //If we have a sequence of characters that belong to the same group, parser can process it 467 //as a single solid character token. 468 //So, there are 3 types of character tokens in parse5: 469 //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000') 470 //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f') 471 //3)CHARACTER_TOKEN - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^') 472 _appendCharToCurrentCharacterToken(type, ch) { 473 if (this.currentCharacterToken && this.currentCharacterToken.type !== type) { 474 this._emitCurrentCharacterToken(); 475 } 476 477 if (this.currentCharacterToken) { 478 this.currentCharacterToken.chars += ch; 479 } else { 480 this._createCharacterToken(type, ch); 481 } 482 } 483 484 _emitCodePoint(cp) { 485 let type = Tokenizer.CHARACTER_TOKEN; 486 487 if (isWhitespace(cp)) { 488 type = Tokenizer.WHITESPACE_CHARACTER_TOKEN; 489 } else if (cp === $.NULL) { 490 type = Tokenizer.NULL_CHARACTER_TOKEN; 491 } 492 493 this._appendCharToCurrentCharacterToken(type, toChar(cp)); 494 } 495 496 _emitSeveralCodePoints(codePoints) { 497 for (let i = 0; i < codePoints.length; i++) { 498 this._emitCodePoint(codePoints[i]); 499 } 500 } 501 502 //NOTE: used then we emit character explicitly. This is always a non-whitespace and a non-null character. 503 //So we can avoid additional checks here. 504 _emitChars(ch) { 505 this._appendCharToCurrentCharacterToken(Tokenizer.CHARACTER_TOKEN, ch); 506 } 507 508 // Character reference helpers 509 _matchNamedCharacterReference(startCp) { 510 let result = null; 511 let excess = 1; 512 let i = findNamedEntityTreeBranch(0, startCp); 513 514 this.tempBuff.push(startCp); 515 516 while (i > -1) { 517 const current = neTree[i]; 518 const inNode = current < MAX_BRANCH_MARKER_VALUE; 519 const nodeWithData = inNode && current & HAS_DATA_FLAG; 520 521 if (nodeWithData) { 522 //NOTE: we use greedy search, so we continue lookup at this point 523 result = current & DATA_DUPLET_FLAG ? [neTree[++i], neTree[++i]] : [neTree[++i]]; 524 excess = 0; 525 } 526 527 const cp = this._consume(); 528 529 this.tempBuff.push(cp); 530 excess++; 531 532 if (cp === $.EOF) { 533 break; 534 } 535 536 if (inNode) { 537 i = current & HAS_BRANCHES_FLAG ? findNamedEntityTreeBranch(i, cp) : -1; 538 } else { 539 i = cp === current ? ++i : -1; 540 } 541 } 542 543 while (excess--) { 544 this.tempBuff.pop(); 545 this._unconsume(); 546 } 547 548 return result; 549 } 550 551 _isCharacterReferenceInAttribute() { 552 return ( 553 this.returnState === ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE || 554 this.returnState === ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE || 555 this.returnState === ATTRIBUTE_VALUE_UNQUOTED_STATE 556 ); 557 } 558 559 _isCharacterReferenceAttributeQuirk(withSemicolon) { 560 if (!withSemicolon && this._isCharacterReferenceInAttribute()) { 561 const nextCp = this._consume(); 562 563 this._unconsume(); 564 565 return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp); 566 } 567 568 return false; 569 } 570 571 _flushCodePointsConsumedAsCharacterReference() { 572 if (this._isCharacterReferenceInAttribute()) { 573 for (let i = 0; i < this.tempBuff.length; i++) { 574 this.currentAttr.value += toChar(this.tempBuff[i]); 575 } 576 } else { 577 this._emitSeveralCodePoints(this.tempBuff); 578 } 579 580 this.tempBuff = []; 581 } 582 583 // State machine 584 585 // Data state 586 //------------------------------------------------------------------ 587 [DATA_STATE](cp) { 588 this.preprocessor.dropParsedChunk(); 589 590 if (cp === $.LESS_THAN_SIGN) { 591 this.state = TAG_OPEN_STATE; 592 } else if (cp === $.AMPERSAND) { 593 this.returnState = DATA_STATE; 594 this.state = CHARACTER_REFERENCE_STATE; 595 } else if (cp === $.NULL) { 596 this._err(ERR.unexpectedNullCharacter); 597 this._emitCodePoint(cp); 598 } else if (cp === $.EOF) { 599 this._emitEOFToken(); 600 } else { 601 this._emitCodePoint(cp); 602 } 603 } 604 605 // RCDATA state 606 //------------------------------------------------------------------ 607 [RCDATA_STATE](cp) { 608 this.preprocessor.dropParsedChunk(); 609 610 if (cp === $.AMPERSAND) { 611 this.returnState = RCDATA_STATE; 612 this.state = CHARACTER_REFERENCE_STATE; 613 } else if (cp === $.LESS_THAN_SIGN) { 614 this.state = RCDATA_LESS_THAN_SIGN_STATE; 615 } else if (cp === $.NULL) { 616 this._err(ERR.unexpectedNullCharacter); 617 this._emitChars(unicode.REPLACEMENT_CHARACTER); 618 } else if (cp === $.EOF) { 619 this._emitEOFToken(); 620 } else { 621 this._emitCodePoint(cp); 622 } 623 } 624 625 // RAWTEXT state 626 //------------------------------------------------------------------ 627 [RAWTEXT_STATE](cp) { 628 this.preprocessor.dropParsedChunk(); 629 630 if (cp === $.LESS_THAN_SIGN) { 631 this.state = RAWTEXT_LESS_THAN_SIGN_STATE; 632 } else if (cp === $.NULL) { 633 this._err(ERR.unexpectedNullCharacter); 634 this._emitChars(unicode.REPLACEMENT_CHARACTER); 635 } else if (cp === $.EOF) { 636 this._emitEOFToken(); 637 } else { 638 this._emitCodePoint(cp); 639 } 640 } 641 642 // Script data state 643 //------------------------------------------------------------------ 644 [SCRIPT_DATA_STATE](cp) { 645 this.preprocessor.dropParsedChunk(); 646 647 if (cp === $.LESS_THAN_SIGN) { 648 this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE; 649 } else if (cp === $.NULL) { 650 this._err(ERR.unexpectedNullCharacter); 651 this._emitChars(unicode.REPLACEMENT_CHARACTER); 652 } else if (cp === $.EOF) { 653 this._emitEOFToken(); 654 } else { 655 this._emitCodePoint(cp); 656 } 657 } 658 659 // PLAINTEXT state 660 //------------------------------------------------------------------ 661 [PLAINTEXT_STATE](cp) { 662 this.preprocessor.dropParsedChunk(); 663 664 if (cp === $.NULL) { 665 this._err(ERR.unexpectedNullCharacter); 666 this._emitChars(unicode.REPLACEMENT_CHARACTER); 667 } else if (cp === $.EOF) { 668 this._emitEOFToken(); 669 } else { 670 this._emitCodePoint(cp); 671 } 672 } 673 674 // Tag open state 675 //------------------------------------------------------------------ 676 [TAG_OPEN_STATE](cp) { 677 if (cp === $.EXCLAMATION_MARK) { 678 this.state = MARKUP_DECLARATION_OPEN_STATE; 679 } else if (cp === $.SOLIDUS) { 680 this.state = END_TAG_OPEN_STATE; 681 } else if (isAsciiLetter(cp)) { 682 this._createStartTagToken(); 683 this._reconsumeInState(TAG_NAME_STATE); 684 } else if (cp === $.QUESTION_MARK) { 685 this._err(ERR.unexpectedQuestionMarkInsteadOfTagName); 686 this._createCommentToken(); 687 this._reconsumeInState(BOGUS_COMMENT_STATE); 688 } else if (cp === $.EOF) { 689 this._err(ERR.eofBeforeTagName); 690 this._emitChars('<'); 691 this._emitEOFToken(); 692 } else { 693 this._err(ERR.invalidFirstCharacterOfTagName); 694 this._emitChars('<'); 695 this._reconsumeInState(DATA_STATE); 696 } 697 } 698 699 // End tag open state 700 //------------------------------------------------------------------ 701 [END_TAG_OPEN_STATE](cp) { 702 if (isAsciiLetter(cp)) { 703 this._createEndTagToken(); 704 this._reconsumeInState(TAG_NAME_STATE); 705 } else if (cp === $.GREATER_THAN_SIGN) { 706 this._err(ERR.missingEndTagName); 707 this.state = DATA_STATE; 708 } else if (cp === $.EOF) { 709 this._err(ERR.eofBeforeTagName); 710 this._emitChars('</'); 711 this._emitEOFToken(); 712 } else { 713 this._err(ERR.invalidFirstCharacterOfTagName); 714 this._createCommentToken(); 715 this._reconsumeInState(BOGUS_COMMENT_STATE); 716 } 717 } 718 719 // Tag name state 720 //------------------------------------------------------------------ 721 [TAG_NAME_STATE](cp) { 722 if (isWhitespace(cp)) { 723 this.state = BEFORE_ATTRIBUTE_NAME_STATE; 724 } else if (cp === $.SOLIDUS) { 725 this.state = SELF_CLOSING_START_TAG_STATE; 726 } else if (cp === $.GREATER_THAN_SIGN) { 727 this.state = DATA_STATE; 728 this._emitCurrentToken(); 729 } else if (isAsciiUpper(cp)) { 730 this.currentToken.tagName += toAsciiLowerChar(cp); 731 } else if (cp === $.NULL) { 732 this._err(ERR.unexpectedNullCharacter); 733 this.currentToken.tagName += unicode.REPLACEMENT_CHARACTER; 734 } else if (cp === $.EOF) { 735 this._err(ERR.eofInTag); 736 this._emitEOFToken(); 737 } else { 738 this.currentToken.tagName += toChar(cp); 739 } 740 } 741 742 // RCDATA less-than sign state 743 //------------------------------------------------------------------ 744 [RCDATA_LESS_THAN_SIGN_STATE](cp) { 745 if (cp === $.SOLIDUS) { 746 this.tempBuff = []; 747 this.state = RCDATA_END_TAG_OPEN_STATE; 748 } else { 749 this._emitChars('<'); 750 this._reconsumeInState(RCDATA_STATE); 751 } 752 } 753 754 // RCDATA end tag open state 755 //------------------------------------------------------------------ 756 [RCDATA_END_TAG_OPEN_STATE](cp) { 757 if (isAsciiLetter(cp)) { 758 this._createEndTagToken(); 759 this._reconsumeInState(RCDATA_END_TAG_NAME_STATE); 760 } else { 761 this._emitChars('</'); 762 this._reconsumeInState(RCDATA_STATE); 763 } 764 } 765 766 // RCDATA end tag name state 767 //------------------------------------------------------------------ 768 [RCDATA_END_TAG_NAME_STATE](cp) { 769 if (isAsciiUpper(cp)) { 770 this.currentToken.tagName += toAsciiLowerChar(cp); 771 this.tempBuff.push(cp); 772 } else if (isAsciiLower(cp)) { 773 this.currentToken.tagName += toChar(cp); 774 this.tempBuff.push(cp); 775 } else { 776 if (this.lastStartTagName === this.currentToken.tagName) { 777 if (isWhitespace(cp)) { 778 this.state = BEFORE_ATTRIBUTE_NAME_STATE; 779 return; 780 } 781 782 if (cp === $.SOLIDUS) { 783 this.state = SELF_CLOSING_START_TAG_STATE; 784 return; 785 } 786 787 if (cp === $.GREATER_THAN_SIGN) { 788 this.state = DATA_STATE; 789 this._emitCurrentToken(); 790 return; 791 } 792 } 793 794 this._emitChars('</'); 795 this._emitSeveralCodePoints(this.tempBuff); 796 this._reconsumeInState(RCDATA_STATE); 797 } 798 } 799 800 // RAWTEXT less-than sign state 801 //------------------------------------------------------------------ 802 [RAWTEXT_LESS_THAN_SIGN_STATE](cp) { 803 if (cp === $.SOLIDUS) { 804 this.tempBuff = []; 805 this.state = RAWTEXT_END_TAG_OPEN_STATE; 806 } else { 807 this._emitChars('<'); 808 this._reconsumeInState(RAWTEXT_STATE); 809 } 810 } 811 812 // RAWTEXT end tag open state 813 //------------------------------------------------------------------ 814 [RAWTEXT_END_TAG_OPEN_STATE](cp) { 815 if (isAsciiLetter(cp)) { 816 this._createEndTagToken(); 817 this._reconsumeInState(RAWTEXT_END_TAG_NAME_STATE); 818 } else { 819 this._emitChars('</'); 820 this._reconsumeInState(RAWTEXT_STATE); 821 } 822 } 823 824 // RAWTEXT end tag name state 825 //------------------------------------------------------------------ 826 [RAWTEXT_END_TAG_NAME_STATE](cp) { 827 if (isAsciiUpper(cp)) { 828 this.currentToken.tagName += toAsciiLowerChar(cp); 829 this.tempBuff.push(cp); 830 } else if (isAsciiLower(cp)) { 831 this.currentToken.tagName += toChar(cp); 832 this.tempBuff.push(cp); 833 } else { 834 if (this.lastStartTagName === this.currentToken.tagName) { 835 if (isWhitespace(cp)) { 836 this.state = BEFORE_ATTRIBUTE_NAME_STATE; 837 return; 838 } 839 840 if (cp === $.SOLIDUS) { 841 this.state = SELF_CLOSING_START_TAG_STATE; 842 return; 843 } 844 845 if (cp === $.GREATER_THAN_SIGN) { 846 this._emitCurrentToken(); 847 this.state = DATA_STATE; 848 return; 849 } 850 } 851 852 this._emitChars('</'); 853 this._emitSeveralCodePoints(this.tempBuff); 854 this._reconsumeInState(RAWTEXT_STATE); 855 } 856 } 857 858 // Script data less-than sign state 859 //------------------------------------------------------------------ 860 [SCRIPT_DATA_LESS_THAN_SIGN_STATE](cp) { 861 if (cp === $.SOLIDUS) { 862 this.tempBuff = []; 863 this.state = SCRIPT_DATA_END_TAG_OPEN_STATE; 864 } else if (cp === $.EXCLAMATION_MARK) { 865 this.state = SCRIPT_DATA_ESCAPE_START_STATE; 866 this._emitChars('<!'); 867 } else { 868 this._emitChars('<'); 869 this._reconsumeInState(SCRIPT_DATA_STATE); 870 } 871 } 872 873 // Script data end tag open state 874 //------------------------------------------------------------------ 875 [SCRIPT_DATA_END_TAG_OPEN_STATE](cp) { 876 if (isAsciiLetter(cp)) { 877 this._createEndTagToken(); 878 this._reconsumeInState(SCRIPT_DATA_END_TAG_NAME_STATE); 879 } else { 880 this._emitChars('</'); 881 this._reconsumeInState(SCRIPT_DATA_STATE); 882 } 883 } 884 885 // Script data end tag name state 886 //------------------------------------------------------------------ 887 [SCRIPT_DATA_END_TAG_NAME_STATE](cp) { 888 if (isAsciiUpper(cp)) { 889 this.currentToken.tagName += toAsciiLowerChar(cp); 890 this.tempBuff.push(cp); 891 } else if (isAsciiLower(cp)) { 892 this.currentToken.tagName += toChar(cp); 893 this.tempBuff.push(cp); 894 } else { 895 if (this.lastStartTagName === this.currentToken.tagName) { 896 if (isWhitespace(cp)) { 897 this.state = BEFORE_ATTRIBUTE_NAME_STATE; 898 return; 899 } else if (cp === $.SOLIDUS) { 900 this.state = SELF_CLOSING_START_TAG_STATE; 901 return; 902 } else if (cp === $.GREATER_THAN_SIGN) { 903 this._emitCurrentToken(); 904 this.state = DATA_STATE; 905 return; 906 } 907 } 908 909 this._emitChars('</'); 910 this._emitSeveralCodePoints(this.tempBuff); 911 this._reconsumeInState(SCRIPT_DATA_STATE); 912 } 913 } 914 915 // Script data escape start state 916 //------------------------------------------------------------------ 917 [SCRIPT_DATA_ESCAPE_START_STATE](cp) { 918 if (cp === $.HYPHEN_MINUS) { 919 this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE; 920 this._emitChars('-'); 921 } else { 922 this._reconsumeInState(SCRIPT_DATA_STATE); 923 } 924 } 925 926 // Script data escape start dash state 927 //------------------------------------------------------------------ 928 [SCRIPT_DATA_ESCAPE_START_DASH_STATE](cp) { 929 if (cp === $.HYPHEN_MINUS) { 930 this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE; 931 this._emitChars('-'); 932 } else { 933 this._reconsumeInState(SCRIPT_DATA_STATE); 934 } 935 } 936 937 // Script data escaped state 938 //------------------------------------------------------------------ 939 [SCRIPT_DATA_ESCAPED_STATE](cp) { 940 if (cp === $.HYPHEN_MINUS) { 941 this.state = SCRIPT_DATA_ESCAPED_DASH_STATE; 942 this._emitChars('-'); 943 } else if (cp === $.LESS_THAN_SIGN) { 944 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE; 945 } else if (cp === $.NULL) { 946 this._err(ERR.unexpectedNullCharacter); 947 this._emitChars(unicode.REPLACEMENT_CHARACTER); 948 } else if (cp === $.EOF) { 949 this._err(ERR.eofInScriptHtmlCommentLikeText); 950 this._emitEOFToken(); 951 } else { 952 this._emitCodePoint(cp); 953 } 954 } 955 956 // Script data escaped dash state 957 //------------------------------------------------------------------ 958 [SCRIPT_DATA_ESCAPED_DASH_STATE](cp) { 959 if (cp === $.HYPHEN_MINUS) { 960 this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE; 961 this._emitChars('-'); 962 } else if (cp === $.LESS_THAN_SIGN) { 963 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE; 964 } else if (cp === $.NULL) { 965 this._err(ERR.unexpectedNullCharacter); 966 this.state = SCRIPT_DATA_ESCAPED_STATE; 967 this._emitChars(unicode.REPLACEMENT_CHARACTER); 968 } else if (cp === $.EOF) { 969 this._err(ERR.eofInScriptHtmlCommentLikeText); 970 this._emitEOFToken(); 971 } else { 972 this.state = SCRIPT_DATA_ESCAPED_STATE; 973 this._emitCodePoint(cp); 974 } 975 } 976 977 // Script data escaped dash dash state 978 //------------------------------------------------------------------ 979 [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE](cp) { 980 if (cp === $.HYPHEN_MINUS) { 981 this._emitChars('-'); 982 } else if (cp === $.LESS_THAN_SIGN) { 983 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE; 984 } else if (cp === $.GREATER_THAN_SIGN) { 985 this.state = SCRIPT_DATA_STATE; 986 this._emitChars('>'); 987 } else if (cp === $.NULL) { 988 this._err(ERR.unexpectedNullCharacter); 989 this.state = SCRIPT_DATA_ESCAPED_STATE; 990 this._emitChars(unicode.REPLACEMENT_CHARACTER); 991 } else if (cp === $.EOF) { 992 this._err(ERR.eofInScriptHtmlCommentLikeText); 993 this._emitEOFToken(); 994 } else { 995 this.state = SCRIPT_DATA_ESCAPED_STATE; 996 this._emitCodePoint(cp); 997 } 998 } 999 1000 // Script data escaped less-than sign state 1001 //------------------------------------------------------------------ 1002 [SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE](cp) { 1003 if (cp === $.SOLIDUS) { 1004 this.tempBuff = []; 1005 this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE; 1006 } else if (isAsciiLetter(cp)) { 1007 this.tempBuff = []; 1008 this._emitChars('<'); 1009 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE); 1010 } else { 1011 this._emitChars('<'); 1012 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE); 1013 } 1014 } 1015 1016 // Script data escaped end tag open state 1017 //------------------------------------------------------------------ 1018 [SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE](cp) { 1019 if (isAsciiLetter(cp)) { 1020 this._createEndTagToken(); 1021 this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE); 1022 } else { 1023 this._emitChars('</'); 1024 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE); 1025 } 1026 } 1027 1028 // Script data escaped end tag name state 1029 //------------------------------------------------------------------ 1030 [SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE](cp) { 1031 if (isAsciiUpper(cp)) { 1032 this.currentToken.tagName += toAsciiLowerChar(cp); 1033 this.tempBuff.push(cp); 1034 } else if (isAsciiLower(cp)) { 1035 this.currentToken.tagName += toChar(cp); 1036 this.tempBuff.push(cp); 1037 } else { 1038 if (this.lastStartTagName === this.currentToken.tagName) { 1039 if (isWhitespace(cp)) { 1040 this.state = BEFORE_ATTRIBUTE_NAME_STATE; 1041 return; 1042 } 1043 1044 if (cp === $.SOLIDUS) { 1045 this.state = SELF_CLOSING_START_TAG_STATE; 1046 return; 1047 } 1048 1049 if (cp === $.GREATER_THAN_SIGN) { 1050 this._emitCurrentToken(); 1051 this.state = DATA_STATE; 1052 return; 1053 } 1054 } 1055 1056 this._emitChars('</'); 1057 this._emitSeveralCodePoints(this.tempBuff); 1058 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE); 1059 } 1060 } 1061 1062 // Script data double escape start state 1063 //------------------------------------------------------------------ 1064 [SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE](cp) { 1065 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) { 1066 this.state = this._isTempBufferEqualToScriptString() 1067 ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE 1068 : SCRIPT_DATA_ESCAPED_STATE; 1069 this._emitCodePoint(cp); 1070 } else if (isAsciiUpper(cp)) { 1071 this.tempBuff.push(toAsciiLowerCodePoint(cp)); 1072 this._emitCodePoint(cp); 1073 } else if (isAsciiLower(cp)) { 1074 this.tempBuff.push(cp); 1075 this._emitCodePoint(cp); 1076 } else { 1077 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE); 1078 } 1079 } 1080 1081 // Script data double escaped state 1082 //------------------------------------------------------------------ 1083 [SCRIPT_DATA_DOUBLE_ESCAPED_STATE](cp) { 1084 if (cp === $.HYPHEN_MINUS) { 1085 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE; 1086 this._emitChars('-'); 1087 } else if (cp === $.LESS_THAN_SIGN) { 1088 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE; 1089 this._emitChars('<'); 1090 } else if (cp === $.NULL) { 1091 this._err(ERR.unexpectedNullCharacter); 1092 this._emitChars(unicode.REPLACEMENT_CHARACTER); 1093 } else if (cp === $.EOF) { 1094 this._err(ERR.eofInScriptHtmlCommentLikeText); 1095 this._emitEOFToken(); 1096 } else { 1097 this._emitCodePoint(cp); 1098 } 1099 } 1100 1101 // Script data double escaped dash state 1102 //------------------------------------------------------------------ 1103 [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE](cp) { 1104 if (cp === $.HYPHEN_MINUS) { 1105 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE; 1106 this._emitChars('-'); 1107 } else if (cp === $.LESS_THAN_SIGN) { 1108 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE; 1109 this._emitChars('<'); 1110 } else if (cp === $.NULL) { 1111 this._err(ERR.unexpectedNullCharacter); 1112 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1113 this._emitChars(unicode.REPLACEMENT_CHARACTER); 1114 } else if (cp === $.EOF) { 1115 this._err(ERR.eofInScriptHtmlCommentLikeText); 1116 this._emitEOFToken(); 1117 } else { 1118 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1119 this._emitCodePoint(cp); 1120 } 1121 } 1122 1123 // Script data double escaped dash dash state 1124 //------------------------------------------------------------------ 1125 [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE](cp) { 1126 if (cp === $.HYPHEN_MINUS) { 1127 this._emitChars('-'); 1128 } else if (cp === $.LESS_THAN_SIGN) { 1129 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE; 1130 this._emitChars('<'); 1131 } else if (cp === $.GREATER_THAN_SIGN) { 1132 this.state = SCRIPT_DATA_STATE; 1133 this._emitChars('>'); 1134 } else if (cp === $.NULL) { 1135 this._err(ERR.unexpectedNullCharacter); 1136 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1137 this._emitChars(unicode.REPLACEMENT_CHARACTER); 1138 } else if (cp === $.EOF) { 1139 this._err(ERR.eofInScriptHtmlCommentLikeText); 1140 this._emitEOFToken(); 1141 } else { 1142 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1143 this._emitCodePoint(cp); 1144 } 1145 } 1146 1147 // Script data double escaped less-than sign state 1148 //------------------------------------------------------------------ 1149 [SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE](cp) { 1150 if (cp === $.SOLIDUS) { 1151 this.tempBuff = []; 1152 this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE; 1153 this._emitChars('/'); 1154 } else { 1155 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE); 1156 } 1157 } 1158 1159 // Script data double escape end state 1160 //------------------------------------------------------------------ 1161 [SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE](cp) { 1162 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) { 1163 this.state = this._isTempBufferEqualToScriptString() 1164 ? SCRIPT_DATA_ESCAPED_STATE 1165 : SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1166 1167 this._emitCodePoint(cp); 1168 } else if (isAsciiUpper(cp)) { 1169 this.tempBuff.push(toAsciiLowerCodePoint(cp)); 1170 this._emitCodePoint(cp); 1171 } else if (isAsciiLower(cp)) { 1172 this.tempBuff.push(cp); 1173 this._emitCodePoint(cp); 1174 } else { 1175 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE); 1176 } 1177 } 1178 1179 // Before attribute name state 1180 //------------------------------------------------------------------ 1181 [BEFORE_ATTRIBUTE_NAME_STATE](cp) { 1182 if (isWhitespace(cp)) { 1183 return; 1184 } 1185 1186 if (cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) { 1187 this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE); 1188 } else if (cp === $.EQUALS_SIGN) { 1189 this._err(ERR.unexpectedEqualsSignBeforeAttributeName); 1190 this._createAttr('='); 1191 this.state = ATTRIBUTE_NAME_STATE; 1192 } else { 1193 this._createAttr(''); 1194 this._reconsumeInState(ATTRIBUTE_NAME_STATE); 1195 } 1196 } 1197 1198 // Attribute name state 1199 //------------------------------------------------------------------ 1200 [ATTRIBUTE_NAME_STATE](cp) { 1201 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) { 1202 this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE); 1203 this._unconsume(); 1204 } else if (cp === $.EQUALS_SIGN) { 1205 this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE); 1206 } else if (isAsciiUpper(cp)) { 1207 this.currentAttr.name += toAsciiLowerChar(cp); 1208 } else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN) { 1209 this._err(ERR.unexpectedCharacterInAttributeName); 1210 this.currentAttr.name += toChar(cp); 1211 } else if (cp === $.NULL) { 1212 this._err(ERR.unexpectedNullCharacter); 1213 this.currentAttr.name += unicode.REPLACEMENT_CHARACTER; 1214 } else { 1215 this.currentAttr.name += toChar(cp); 1216 } 1217 } 1218 1219 // After attribute name state 1220 //------------------------------------------------------------------ 1221 [AFTER_ATTRIBUTE_NAME_STATE](cp) { 1222 if (isWhitespace(cp)) { 1223 return; 1224 } 1225 1226 if (cp === $.SOLIDUS) { 1227 this.state = SELF_CLOSING_START_TAG_STATE; 1228 } else if (cp === $.EQUALS_SIGN) { 1229 this.state = BEFORE_ATTRIBUTE_VALUE_STATE; 1230 } else if (cp === $.GREATER_THAN_SIGN) { 1231 this.state = DATA_STATE; 1232 this._emitCurrentToken(); 1233 } else if (cp === $.EOF) { 1234 this._err(ERR.eofInTag); 1235 this._emitEOFToken(); 1236 } else { 1237 this._createAttr(''); 1238 this._reconsumeInState(ATTRIBUTE_NAME_STATE); 1239 } 1240 } 1241 1242 // Before attribute value state 1243 //------------------------------------------------------------------ 1244 [BEFORE_ATTRIBUTE_VALUE_STATE](cp) { 1245 if (isWhitespace(cp)) { 1246 return; 1247 } 1248 1249 if (cp === $.QUOTATION_MARK) { 1250 this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 1251 } else if (cp === $.APOSTROPHE) { 1252 this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 1253 } else if (cp === $.GREATER_THAN_SIGN) { 1254 this._err(ERR.missingAttributeValue); 1255 this.state = DATA_STATE; 1256 this._emitCurrentToken(); 1257 } else { 1258 this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE); 1259 } 1260 } 1261 1262 // Attribute value (double-quoted) state 1263 //------------------------------------------------------------------ 1264 [ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE](cp) { 1265 if (cp === $.QUOTATION_MARK) { 1266 this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; 1267 } else if (cp === $.AMPERSAND) { 1268 this.returnState = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 1269 this.state = CHARACTER_REFERENCE_STATE; 1270 } else if (cp === $.NULL) { 1271 this._err(ERR.unexpectedNullCharacter); 1272 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; 1273 } else if (cp === $.EOF) { 1274 this._err(ERR.eofInTag); 1275 this._emitEOFToken(); 1276 } else { 1277 this.currentAttr.value += toChar(cp); 1278 } 1279 } 1280 1281 // Attribute value (single-quoted) state 1282 //------------------------------------------------------------------ 1283 [ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE](cp) { 1284 if (cp === $.APOSTROPHE) { 1285 this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; 1286 } else if (cp === $.AMPERSAND) { 1287 this.returnState = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 1288 this.state = CHARACTER_REFERENCE_STATE; 1289 } else if (cp === $.NULL) { 1290 this._err(ERR.unexpectedNullCharacter); 1291 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; 1292 } else if (cp === $.EOF) { 1293 this._err(ERR.eofInTag); 1294 this._emitEOFToken(); 1295 } else { 1296 this.currentAttr.value += toChar(cp); 1297 } 1298 } 1299 1300 // Attribute value (unquoted) state 1301 //------------------------------------------------------------------ 1302 [ATTRIBUTE_VALUE_UNQUOTED_STATE](cp) { 1303 if (isWhitespace(cp)) { 1304 this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE); 1305 } else if (cp === $.AMPERSAND) { 1306 this.returnState = ATTRIBUTE_VALUE_UNQUOTED_STATE; 1307 this.state = CHARACTER_REFERENCE_STATE; 1308 } else if (cp === $.GREATER_THAN_SIGN) { 1309 this._leaveAttrValue(DATA_STATE); 1310 this._emitCurrentToken(); 1311 } else if (cp === $.NULL) { 1312 this._err(ERR.unexpectedNullCharacter); 1313 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; 1314 } else if ( 1315 cp === $.QUOTATION_MARK || 1316 cp === $.APOSTROPHE || 1317 cp === $.LESS_THAN_SIGN || 1318 cp === $.EQUALS_SIGN || 1319 cp === $.GRAVE_ACCENT 1320 ) { 1321 this._err(ERR.unexpectedCharacterInUnquotedAttributeValue); 1322 this.currentAttr.value += toChar(cp); 1323 } else if (cp === $.EOF) { 1324 this._err(ERR.eofInTag); 1325 this._emitEOFToken(); 1326 } else { 1327 this.currentAttr.value += toChar(cp); 1328 } 1329 } 1330 1331 // After attribute value (quoted) state 1332 //------------------------------------------------------------------ 1333 [AFTER_ATTRIBUTE_VALUE_QUOTED_STATE](cp) { 1334 if (isWhitespace(cp)) { 1335 this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE); 1336 } else if (cp === $.SOLIDUS) { 1337 this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE); 1338 } else if (cp === $.GREATER_THAN_SIGN) { 1339 this._leaveAttrValue(DATA_STATE); 1340 this._emitCurrentToken(); 1341 } else if (cp === $.EOF) { 1342 this._err(ERR.eofInTag); 1343 this._emitEOFToken(); 1344 } else { 1345 this._err(ERR.missingWhitespaceBetweenAttributes); 1346 this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE); 1347 } 1348 } 1349 1350 // Self-closing start tag state 1351 //------------------------------------------------------------------ 1352 [SELF_CLOSING_START_TAG_STATE](cp) { 1353 if (cp === $.GREATER_THAN_SIGN) { 1354 this.currentToken.selfClosing = true; 1355 this.state = DATA_STATE; 1356 this._emitCurrentToken(); 1357 } else if (cp === $.EOF) { 1358 this._err(ERR.eofInTag); 1359 this._emitEOFToken(); 1360 } else { 1361 this._err(ERR.unexpectedSolidusInTag); 1362 this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE); 1363 } 1364 } 1365 1366 // Bogus comment state 1367 //------------------------------------------------------------------ 1368 [BOGUS_COMMENT_STATE](cp) { 1369 if (cp === $.GREATER_THAN_SIGN) { 1370 this.state = DATA_STATE; 1371 this._emitCurrentToken(); 1372 } else if (cp === $.EOF) { 1373 this._emitCurrentToken(); 1374 this._emitEOFToken(); 1375 } else if (cp === $.NULL) { 1376 this._err(ERR.unexpectedNullCharacter); 1377 this.currentToken.data += unicode.REPLACEMENT_CHARACTER; 1378 } else { 1379 this.currentToken.data += toChar(cp); 1380 } 1381 } 1382 1383 // Markup declaration open state 1384 //------------------------------------------------------------------ 1385 [MARKUP_DECLARATION_OPEN_STATE](cp) { 1386 if (this._consumeSequenceIfMatch($$.DASH_DASH_STRING, cp, true)) { 1387 this._createCommentToken(); 1388 this.state = COMMENT_START_STATE; 1389 } else if (this._consumeSequenceIfMatch($$.DOCTYPE_STRING, cp, false)) { 1390 this.state = DOCTYPE_STATE; 1391 } else if (this._consumeSequenceIfMatch($$.CDATA_START_STRING, cp, true)) { 1392 if (this.allowCDATA) { 1393 this.state = CDATA_SECTION_STATE; 1394 } else { 1395 this._err(ERR.cdataInHtmlContent); 1396 this._createCommentToken(); 1397 this.currentToken.data = '[CDATA['; 1398 this.state = BOGUS_COMMENT_STATE; 1399 } 1400 } 1401 1402 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup 1403 //results are no longer valid and we will need to start over. 1404 else if (!this._ensureHibernation()) { 1405 this._err(ERR.incorrectlyOpenedComment); 1406 this._createCommentToken(); 1407 this._reconsumeInState(BOGUS_COMMENT_STATE); 1408 } 1409 } 1410 1411 // Comment start state 1412 //------------------------------------------------------------------ 1413 [COMMENT_START_STATE](cp) { 1414 if (cp === $.HYPHEN_MINUS) { 1415 this.state = COMMENT_START_DASH_STATE; 1416 } else if (cp === $.GREATER_THAN_SIGN) { 1417 this._err(ERR.abruptClosingOfEmptyComment); 1418 this.state = DATA_STATE; 1419 this._emitCurrentToken(); 1420 } else { 1421 this._reconsumeInState(COMMENT_STATE); 1422 } 1423 } 1424 1425 // Comment start dash state 1426 //------------------------------------------------------------------ 1427 [COMMENT_START_DASH_STATE](cp) { 1428 if (cp === $.HYPHEN_MINUS) { 1429 this.state = COMMENT_END_STATE; 1430 } else if (cp === $.GREATER_THAN_SIGN) { 1431 this._err(ERR.abruptClosingOfEmptyComment); 1432 this.state = DATA_STATE; 1433 this._emitCurrentToken(); 1434 } else if (cp === $.EOF) { 1435 this._err(ERR.eofInComment); 1436 this._emitCurrentToken(); 1437 this._emitEOFToken(); 1438 } else { 1439 this.currentToken.data += '-'; 1440 this._reconsumeInState(COMMENT_STATE); 1441 } 1442 } 1443 1444 // Comment state 1445 //------------------------------------------------------------------ 1446 [COMMENT_STATE](cp) { 1447 if (cp === $.HYPHEN_MINUS) { 1448 this.state = COMMENT_END_DASH_STATE; 1449 } else if (cp === $.LESS_THAN_SIGN) { 1450 this.currentToken.data += '<'; 1451 this.state = COMMENT_LESS_THAN_SIGN_STATE; 1452 } else if (cp === $.NULL) { 1453 this._err(ERR.unexpectedNullCharacter); 1454 this.currentToken.data += unicode.REPLACEMENT_CHARACTER; 1455 } else if (cp === $.EOF) { 1456 this._err(ERR.eofInComment); 1457 this._emitCurrentToken(); 1458 this._emitEOFToken(); 1459 } else { 1460 this.currentToken.data += toChar(cp); 1461 } 1462 } 1463 1464 // Comment less-than sign state 1465 //------------------------------------------------------------------ 1466 [COMMENT_LESS_THAN_SIGN_STATE](cp) { 1467 if (cp === $.EXCLAMATION_MARK) { 1468 this.currentToken.data += '!'; 1469 this.state = COMMENT_LESS_THAN_SIGN_BANG_STATE; 1470 } else if (cp === $.LESS_THAN_SIGN) { 1471 this.currentToken.data += '!'; 1472 } else { 1473 this._reconsumeInState(COMMENT_STATE); 1474 } 1475 } 1476 1477 // Comment less-than sign bang state 1478 //------------------------------------------------------------------ 1479 [COMMENT_LESS_THAN_SIGN_BANG_STATE](cp) { 1480 if (cp === $.HYPHEN_MINUS) { 1481 this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE; 1482 } else { 1483 this._reconsumeInState(COMMENT_STATE); 1484 } 1485 } 1486 1487 // Comment less-than sign bang dash state 1488 //------------------------------------------------------------------ 1489 [COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE](cp) { 1490 if (cp === $.HYPHEN_MINUS) { 1491 this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE; 1492 } else { 1493 this._reconsumeInState(COMMENT_END_DASH_STATE); 1494 } 1495 } 1496 1497 // Comment less-than sign bang dash dash state 1498 //------------------------------------------------------------------ 1499 [COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE](cp) { 1500 if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) { 1501 this._err(ERR.nestedComment); 1502 } 1503 1504 this._reconsumeInState(COMMENT_END_STATE); 1505 } 1506 1507 // Comment end dash state 1508 //------------------------------------------------------------------ 1509 [COMMENT_END_DASH_STATE](cp) { 1510 if (cp === $.HYPHEN_MINUS) { 1511 this.state = COMMENT_END_STATE; 1512 } else if (cp === $.EOF) { 1513 this._err(ERR.eofInComment); 1514 this._emitCurrentToken(); 1515 this._emitEOFToken(); 1516 } else { 1517 this.currentToken.data += '-'; 1518 this._reconsumeInState(COMMENT_STATE); 1519 } 1520 } 1521 1522 // Comment end state 1523 //------------------------------------------------------------------ 1524 [COMMENT_END_STATE](cp) { 1525 if (cp === $.GREATER_THAN_SIGN) { 1526 this.state = DATA_STATE; 1527 this._emitCurrentToken(); 1528 } else if (cp === $.EXCLAMATION_MARK) { 1529 this.state = COMMENT_END_BANG_STATE; 1530 } else if (cp === $.HYPHEN_MINUS) { 1531 this.currentToken.data += '-'; 1532 } else if (cp === $.EOF) { 1533 this._err(ERR.eofInComment); 1534 this._emitCurrentToken(); 1535 this._emitEOFToken(); 1536 } else { 1537 this.currentToken.data += '--'; 1538 this._reconsumeInState(COMMENT_STATE); 1539 } 1540 } 1541 1542 // Comment end bang state 1543 //------------------------------------------------------------------ 1544 [COMMENT_END_BANG_STATE](cp) { 1545 if (cp === $.HYPHEN_MINUS) { 1546 this.currentToken.data += '--!'; 1547 this.state = COMMENT_END_DASH_STATE; 1548 } else if (cp === $.GREATER_THAN_SIGN) { 1549 this._err(ERR.incorrectlyClosedComment); 1550 this.state = DATA_STATE; 1551 this._emitCurrentToken(); 1552 } else if (cp === $.EOF) { 1553 this._err(ERR.eofInComment); 1554 this._emitCurrentToken(); 1555 this._emitEOFToken(); 1556 } else { 1557 this.currentToken.data += '--!'; 1558 this._reconsumeInState(COMMENT_STATE); 1559 } 1560 } 1561 1562 // DOCTYPE state 1563 //------------------------------------------------------------------ 1564 [DOCTYPE_STATE](cp) { 1565 if (isWhitespace(cp)) { 1566 this.state = BEFORE_DOCTYPE_NAME_STATE; 1567 } else if (cp === $.GREATER_THAN_SIGN) { 1568 this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE); 1569 } else if (cp === $.EOF) { 1570 this._err(ERR.eofInDoctype); 1571 this._createDoctypeToken(null); 1572 this.currentToken.forceQuirks = true; 1573 this._emitCurrentToken(); 1574 this._emitEOFToken(); 1575 } else { 1576 this._err(ERR.missingWhitespaceBeforeDoctypeName); 1577 this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE); 1578 } 1579 } 1580 1581 // Before DOCTYPE name state 1582 //------------------------------------------------------------------ 1583 [BEFORE_DOCTYPE_NAME_STATE](cp) { 1584 if (isWhitespace(cp)) { 1585 return; 1586 } 1587 1588 if (isAsciiUpper(cp)) { 1589 this._createDoctypeToken(toAsciiLowerChar(cp)); 1590 this.state = DOCTYPE_NAME_STATE; 1591 } else if (cp === $.NULL) { 1592 this._err(ERR.unexpectedNullCharacter); 1593 this._createDoctypeToken(unicode.REPLACEMENT_CHARACTER); 1594 this.state = DOCTYPE_NAME_STATE; 1595 } else if (cp === $.GREATER_THAN_SIGN) { 1596 this._err(ERR.missingDoctypeName); 1597 this._createDoctypeToken(null); 1598 this.currentToken.forceQuirks = true; 1599 this._emitCurrentToken(); 1600 this.state = DATA_STATE; 1601 } else if (cp === $.EOF) { 1602 this._err(ERR.eofInDoctype); 1603 this._createDoctypeToken(null); 1604 this.currentToken.forceQuirks = true; 1605 this._emitCurrentToken(); 1606 this._emitEOFToken(); 1607 } else { 1608 this._createDoctypeToken(toChar(cp)); 1609 this.state = DOCTYPE_NAME_STATE; 1610 } 1611 } 1612 1613 // DOCTYPE name state 1614 //------------------------------------------------------------------ 1615 [DOCTYPE_NAME_STATE](cp) { 1616 if (isWhitespace(cp)) { 1617 this.state = AFTER_DOCTYPE_NAME_STATE; 1618 } else if (cp === $.GREATER_THAN_SIGN) { 1619 this.state = DATA_STATE; 1620 this._emitCurrentToken(); 1621 } else if (isAsciiUpper(cp)) { 1622 this.currentToken.name += toAsciiLowerChar(cp); 1623 } else if (cp === $.NULL) { 1624 this._err(ERR.unexpectedNullCharacter); 1625 this.currentToken.name += unicode.REPLACEMENT_CHARACTER; 1626 } else if (cp === $.EOF) { 1627 this._err(ERR.eofInDoctype); 1628 this.currentToken.forceQuirks = true; 1629 this._emitCurrentToken(); 1630 this._emitEOFToken(); 1631 } else { 1632 this.currentToken.name += toChar(cp); 1633 } 1634 } 1635 1636 // After DOCTYPE name state 1637 //------------------------------------------------------------------ 1638 [AFTER_DOCTYPE_NAME_STATE](cp) { 1639 if (isWhitespace(cp)) { 1640 return; 1641 } 1642 1643 if (cp === $.GREATER_THAN_SIGN) { 1644 this.state = DATA_STATE; 1645 this._emitCurrentToken(); 1646 } else if (cp === $.EOF) { 1647 this._err(ERR.eofInDoctype); 1648 this.currentToken.forceQuirks = true; 1649 this._emitCurrentToken(); 1650 this._emitEOFToken(); 1651 } else if (this._consumeSequenceIfMatch($$.PUBLIC_STRING, cp, false)) { 1652 this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE; 1653 } else if (this._consumeSequenceIfMatch($$.SYSTEM_STRING, cp, false)) { 1654 this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE; 1655 } 1656 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup 1657 //results are no longer valid and we will need to start over. 1658 else if (!this._ensureHibernation()) { 1659 this._err(ERR.invalidCharacterSequenceAfterDoctypeName); 1660 this.currentToken.forceQuirks = true; 1661 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1662 } 1663 } 1664 1665 // After DOCTYPE public keyword state 1666 //------------------------------------------------------------------ 1667 [AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE](cp) { 1668 if (isWhitespace(cp)) { 1669 this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 1670 } else if (cp === $.QUOTATION_MARK) { 1671 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 1672 this.currentToken.publicId = ''; 1673 this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; 1674 } else if (cp === $.APOSTROPHE) { 1675 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 1676 this.currentToken.publicId = ''; 1677 this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; 1678 } else if (cp === $.GREATER_THAN_SIGN) { 1679 this._err(ERR.missingDoctypePublicIdentifier); 1680 this.currentToken.forceQuirks = true; 1681 this.state = DATA_STATE; 1682 this._emitCurrentToken(); 1683 } else if (cp === $.EOF) { 1684 this._err(ERR.eofInDoctype); 1685 this.currentToken.forceQuirks = true; 1686 this._emitCurrentToken(); 1687 this._emitEOFToken(); 1688 } else { 1689 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 1690 this.currentToken.forceQuirks = true; 1691 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1692 } 1693 } 1694 1695 // Before DOCTYPE public identifier state 1696 //------------------------------------------------------------------ 1697 [BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) { 1698 if (isWhitespace(cp)) { 1699 return; 1700 } 1701 1702 if (cp === $.QUOTATION_MARK) { 1703 this.currentToken.publicId = ''; 1704 this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; 1705 } else if (cp === $.APOSTROPHE) { 1706 this.currentToken.publicId = ''; 1707 this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; 1708 } else if (cp === $.GREATER_THAN_SIGN) { 1709 this._err(ERR.missingDoctypePublicIdentifier); 1710 this.currentToken.forceQuirks = true; 1711 this.state = DATA_STATE; 1712 this._emitCurrentToken(); 1713 } else if (cp === $.EOF) { 1714 this._err(ERR.eofInDoctype); 1715 this.currentToken.forceQuirks = true; 1716 this._emitCurrentToken(); 1717 this._emitEOFToken(); 1718 } else { 1719 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 1720 this.currentToken.forceQuirks = true; 1721 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1722 } 1723 } 1724 1725 // DOCTYPE public identifier (double-quoted) state 1726 //------------------------------------------------------------------ 1727 [DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) { 1728 if (cp === $.QUOTATION_MARK) { 1729 this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 1730 } else if (cp === $.NULL) { 1731 this._err(ERR.unexpectedNullCharacter); 1732 this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER; 1733 } else if (cp === $.GREATER_THAN_SIGN) { 1734 this._err(ERR.abruptDoctypePublicIdentifier); 1735 this.currentToken.forceQuirks = true; 1736 this._emitCurrentToken(); 1737 this.state = DATA_STATE; 1738 } else if (cp === $.EOF) { 1739 this._err(ERR.eofInDoctype); 1740 this.currentToken.forceQuirks = true; 1741 this._emitCurrentToken(); 1742 this._emitEOFToken(); 1743 } else { 1744 this.currentToken.publicId += toChar(cp); 1745 } 1746 } 1747 1748 // DOCTYPE public identifier (single-quoted) state 1749 //------------------------------------------------------------------ 1750 [DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE](cp) { 1751 if (cp === $.APOSTROPHE) { 1752 this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 1753 } else if (cp === $.NULL) { 1754 this._err(ERR.unexpectedNullCharacter); 1755 this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER; 1756 } else if (cp === $.GREATER_THAN_SIGN) { 1757 this._err(ERR.abruptDoctypePublicIdentifier); 1758 this.currentToken.forceQuirks = true; 1759 this._emitCurrentToken(); 1760 this.state = DATA_STATE; 1761 } else if (cp === $.EOF) { 1762 this._err(ERR.eofInDoctype); 1763 this.currentToken.forceQuirks = true; 1764 this._emitCurrentToken(); 1765 this._emitEOFToken(); 1766 } else { 1767 this.currentToken.publicId += toChar(cp); 1768 } 1769 } 1770 1771 // After DOCTYPE public identifier state 1772 //------------------------------------------------------------------ 1773 [AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) { 1774 if (isWhitespace(cp)) { 1775 this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE; 1776 } else if (cp === $.GREATER_THAN_SIGN) { 1777 this.state = DATA_STATE; 1778 this._emitCurrentToken(); 1779 } else if (cp === $.QUOTATION_MARK) { 1780 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); 1781 this.currentToken.systemId = ''; 1782 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 1783 } else if (cp === $.APOSTROPHE) { 1784 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); 1785 this.currentToken.systemId = ''; 1786 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 1787 } else if (cp === $.EOF) { 1788 this._err(ERR.eofInDoctype); 1789 this.currentToken.forceQuirks = true; 1790 this._emitCurrentToken(); 1791 this._emitEOFToken(); 1792 } else { 1793 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 1794 this.currentToken.forceQuirks = true; 1795 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1796 } 1797 } 1798 1799 // Between DOCTYPE public and system identifiers state 1800 //------------------------------------------------------------------ 1801 [BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE](cp) { 1802 if (isWhitespace(cp)) { 1803 return; 1804 } 1805 1806 if (cp === $.GREATER_THAN_SIGN) { 1807 this._emitCurrentToken(); 1808 this.state = DATA_STATE; 1809 } else if (cp === $.QUOTATION_MARK) { 1810 this.currentToken.systemId = ''; 1811 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 1812 } else if (cp === $.APOSTROPHE) { 1813 this.currentToken.systemId = ''; 1814 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 1815 } else if (cp === $.EOF) { 1816 this._err(ERR.eofInDoctype); 1817 this.currentToken.forceQuirks = true; 1818 this._emitCurrentToken(); 1819 this._emitEOFToken(); 1820 } else { 1821 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 1822 this.currentToken.forceQuirks = true; 1823 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1824 } 1825 } 1826 1827 // After DOCTYPE system keyword state 1828 //------------------------------------------------------------------ 1829 [AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE](cp) { 1830 if (isWhitespace(cp)) { 1831 this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 1832 } else if (cp === $.QUOTATION_MARK) { 1833 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); 1834 this.currentToken.systemId = ''; 1835 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 1836 } else if (cp === $.APOSTROPHE) { 1837 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); 1838 this.currentToken.systemId = ''; 1839 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 1840 } else if (cp === $.GREATER_THAN_SIGN) { 1841 this._err(ERR.missingDoctypeSystemIdentifier); 1842 this.currentToken.forceQuirks = true; 1843 this.state = DATA_STATE; 1844 this._emitCurrentToken(); 1845 } else if (cp === $.EOF) { 1846 this._err(ERR.eofInDoctype); 1847 this.currentToken.forceQuirks = true; 1848 this._emitCurrentToken(); 1849 this._emitEOFToken(); 1850 } else { 1851 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 1852 this.currentToken.forceQuirks = true; 1853 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1854 } 1855 } 1856 1857 // Before DOCTYPE system identifier state 1858 //------------------------------------------------------------------ 1859 [BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) { 1860 if (isWhitespace(cp)) { 1861 return; 1862 } 1863 1864 if (cp === $.QUOTATION_MARK) { 1865 this.currentToken.systemId = ''; 1866 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 1867 } else if (cp === $.APOSTROPHE) { 1868 this.currentToken.systemId = ''; 1869 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 1870 } else if (cp === $.GREATER_THAN_SIGN) { 1871 this._err(ERR.missingDoctypeSystemIdentifier); 1872 this.currentToken.forceQuirks = true; 1873 this.state = DATA_STATE; 1874 this._emitCurrentToken(); 1875 } else if (cp === $.EOF) { 1876 this._err(ERR.eofInDoctype); 1877 this.currentToken.forceQuirks = true; 1878 this._emitCurrentToken(); 1879 this._emitEOFToken(); 1880 } else { 1881 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 1882 this.currentToken.forceQuirks = true; 1883 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1884 } 1885 } 1886 1887 // DOCTYPE system identifier (double-quoted) state 1888 //------------------------------------------------------------------ 1889 [DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) { 1890 if (cp === $.QUOTATION_MARK) { 1891 this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 1892 } else if (cp === $.NULL) { 1893 this._err(ERR.unexpectedNullCharacter); 1894 this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER; 1895 } else if (cp === $.GREATER_THAN_SIGN) { 1896 this._err(ERR.abruptDoctypeSystemIdentifier); 1897 this.currentToken.forceQuirks = true; 1898 this._emitCurrentToken(); 1899 this.state = DATA_STATE; 1900 } else if (cp === $.EOF) { 1901 this._err(ERR.eofInDoctype); 1902 this.currentToken.forceQuirks = true; 1903 this._emitCurrentToken(); 1904 this._emitEOFToken(); 1905 } else { 1906 this.currentToken.systemId += toChar(cp); 1907 } 1908 } 1909 1910 // DOCTYPE system identifier (single-quoted) state 1911 //------------------------------------------------------------------ 1912 [DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE](cp) { 1913 if (cp === $.APOSTROPHE) { 1914 this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 1915 } else if (cp === $.NULL) { 1916 this._err(ERR.unexpectedNullCharacter); 1917 this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER; 1918 } else if (cp === $.GREATER_THAN_SIGN) { 1919 this._err(ERR.abruptDoctypeSystemIdentifier); 1920 this.currentToken.forceQuirks = true; 1921 this._emitCurrentToken(); 1922 this.state = DATA_STATE; 1923 } else if (cp === $.EOF) { 1924 this._err(ERR.eofInDoctype); 1925 this.currentToken.forceQuirks = true; 1926 this._emitCurrentToken(); 1927 this._emitEOFToken(); 1928 } else { 1929 this.currentToken.systemId += toChar(cp); 1930 } 1931 } 1932 1933 // After DOCTYPE system identifier state 1934 //------------------------------------------------------------------ 1935 [AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) { 1936 if (isWhitespace(cp)) { 1937 return; 1938 } 1939 1940 if (cp === $.GREATER_THAN_SIGN) { 1941 this._emitCurrentToken(); 1942 this.state = DATA_STATE; 1943 } else if (cp === $.EOF) { 1944 this._err(ERR.eofInDoctype); 1945 this.currentToken.forceQuirks = true; 1946 this._emitCurrentToken(); 1947 this._emitEOFToken(); 1948 } else { 1949 this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier); 1950 this._reconsumeInState(BOGUS_DOCTYPE_STATE); 1951 } 1952 } 1953 1954 // Bogus DOCTYPE state 1955 //------------------------------------------------------------------ 1956 [BOGUS_DOCTYPE_STATE](cp) { 1957 if (cp === $.GREATER_THAN_SIGN) { 1958 this._emitCurrentToken(); 1959 this.state = DATA_STATE; 1960 } else if (cp === $.NULL) { 1961 this._err(ERR.unexpectedNullCharacter); 1962 } else if (cp === $.EOF) { 1963 this._emitCurrentToken(); 1964 this._emitEOFToken(); 1965 } 1966 } 1967 1968 // CDATA section state 1969 //------------------------------------------------------------------ 1970 [CDATA_SECTION_STATE](cp) { 1971 if (cp === $.RIGHT_SQUARE_BRACKET) { 1972 this.state = CDATA_SECTION_BRACKET_STATE; 1973 } else if (cp === $.EOF) { 1974 this._err(ERR.eofInCdata); 1975 this._emitEOFToken(); 1976 } else { 1977 this._emitCodePoint(cp); 1978 } 1979 } 1980 1981 // CDATA section bracket state 1982 //------------------------------------------------------------------ 1983 [CDATA_SECTION_BRACKET_STATE](cp) { 1984 if (cp === $.RIGHT_SQUARE_BRACKET) { 1985 this.state = CDATA_SECTION_END_STATE; 1986 } else { 1987 this._emitChars(']'); 1988 this._reconsumeInState(CDATA_SECTION_STATE); 1989 } 1990 } 1991 1992 // CDATA section end state 1993 //------------------------------------------------------------------ 1994 [CDATA_SECTION_END_STATE](cp) { 1995 if (cp === $.GREATER_THAN_SIGN) { 1996 this.state = DATA_STATE; 1997 } else if (cp === $.RIGHT_SQUARE_BRACKET) { 1998 this._emitChars(']'); 1999 } else { 2000 this._emitChars(']]'); 2001 this._reconsumeInState(CDATA_SECTION_STATE); 2002 } 2003 } 2004 2005 // Character reference state 2006 //------------------------------------------------------------------ 2007 [CHARACTER_REFERENCE_STATE](cp) { 2008 this.tempBuff = [$.AMPERSAND]; 2009 2010 if (cp === $.NUMBER_SIGN) { 2011 this.tempBuff.push(cp); 2012 this.state = NUMERIC_CHARACTER_REFERENCE_STATE; 2013 } else if (isAsciiAlphaNumeric(cp)) { 2014 this._reconsumeInState(NAMED_CHARACTER_REFERENCE_STATE); 2015 } else { 2016 this._flushCodePointsConsumedAsCharacterReference(); 2017 this._reconsumeInState(this.returnState); 2018 } 2019 } 2020 2021 // Named character reference state 2022 //------------------------------------------------------------------ 2023 [NAMED_CHARACTER_REFERENCE_STATE](cp) { 2024 const matchResult = this._matchNamedCharacterReference(cp); 2025 2026 //NOTE: matching can be abrupted by hibernation. In that case match 2027 //results are no longer valid and we will need to start over. 2028 if (this._ensureHibernation()) { 2029 this.tempBuff = [$.AMPERSAND]; 2030 } else if (matchResult) { 2031 const withSemicolon = this.tempBuff[this.tempBuff.length - 1] === $.SEMICOLON; 2032 2033 if (!this._isCharacterReferenceAttributeQuirk(withSemicolon)) { 2034 if (!withSemicolon) { 2035 this._errOnNextCodePoint(ERR.missingSemicolonAfterCharacterReference); 2036 } 2037 2038 this.tempBuff = matchResult; 2039 } 2040 2041 this._flushCodePointsConsumedAsCharacterReference(); 2042 this.state = this.returnState; 2043 } else { 2044 this._flushCodePointsConsumedAsCharacterReference(); 2045 this.state = AMBIGUOUS_AMPERSAND_STATE; 2046 } 2047 } 2048 2049 // Ambiguos ampersand state 2050 //------------------------------------------------------------------ 2051 [AMBIGUOUS_AMPERSAND_STATE](cp) { 2052 if (isAsciiAlphaNumeric(cp)) { 2053 if (this._isCharacterReferenceInAttribute()) { 2054 this.currentAttr.value += toChar(cp); 2055 } else { 2056 this._emitCodePoint(cp); 2057 } 2058 } else { 2059 if (cp === $.SEMICOLON) { 2060 this._err(ERR.unknownNamedCharacterReference); 2061 } 2062 2063 this._reconsumeInState(this.returnState); 2064 } 2065 } 2066 2067 // Numeric character reference state 2068 //------------------------------------------------------------------ 2069 [NUMERIC_CHARACTER_REFERENCE_STATE](cp) { 2070 this.charRefCode = 0; 2071 2072 if (cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X) { 2073 this.tempBuff.push(cp); 2074 this.state = HEXADEMICAL_CHARACTER_REFERENCE_START_STATE; 2075 } else { 2076 this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_START_STATE); 2077 } 2078 } 2079 2080 // Hexademical character reference start state 2081 //------------------------------------------------------------------ 2082 [HEXADEMICAL_CHARACTER_REFERENCE_START_STATE](cp) { 2083 if (isAsciiHexDigit(cp)) { 2084 this._reconsumeInState(HEXADEMICAL_CHARACTER_REFERENCE_STATE); 2085 } else { 2086 this._err(ERR.absenceOfDigitsInNumericCharacterReference); 2087 this._flushCodePointsConsumedAsCharacterReference(); 2088 this._reconsumeInState(this.returnState); 2089 } 2090 } 2091 2092 // Decimal character reference start state 2093 //------------------------------------------------------------------ 2094 [DECIMAL_CHARACTER_REFERENCE_START_STATE](cp) { 2095 if (isAsciiDigit(cp)) { 2096 this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_STATE); 2097 } else { 2098 this._err(ERR.absenceOfDigitsInNumericCharacterReference); 2099 this._flushCodePointsConsumedAsCharacterReference(); 2100 this._reconsumeInState(this.returnState); 2101 } 2102 } 2103 2104 // Hexademical character reference state 2105 //------------------------------------------------------------------ 2106 [HEXADEMICAL_CHARACTER_REFERENCE_STATE](cp) { 2107 if (isAsciiUpperHexDigit(cp)) { 2108 this.charRefCode = this.charRefCode * 16 + cp - 0x37; 2109 } else if (isAsciiLowerHexDigit(cp)) { 2110 this.charRefCode = this.charRefCode * 16 + cp - 0x57; 2111 } else if (isAsciiDigit(cp)) { 2112 this.charRefCode = this.charRefCode * 16 + cp - 0x30; 2113 } else if (cp === $.SEMICOLON) { 2114 this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE; 2115 } else { 2116 this._err(ERR.missingSemicolonAfterCharacterReference); 2117 this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE); 2118 } 2119 } 2120 2121 // Decimal character reference state 2122 //------------------------------------------------------------------ 2123 [DECIMAL_CHARACTER_REFERENCE_STATE](cp) { 2124 if (isAsciiDigit(cp)) { 2125 this.charRefCode = this.charRefCode * 10 + cp - 0x30; 2126 } else if (cp === $.SEMICOLON) { 2127 this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE; 2128 } else { 2129 this._err(ERR.missingSemicolonAfterCharacterReference); 2130 this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE); 2131 } 2132 } 2133 2134 // Numeric character reference end state 2135 //------------------------------------------------------------------ 2136 [NUMERIC_CHARACTER_REFERENCE_END_STATE]() { 2137 if (this.charRefCode === $.NULL) { 2138 this._err(ERR.nullCharacterReference); 2139 this.charRefCode = $.REPLACEMENT_CHARACTER; 2140 } else if (this.charRefCode > 0x10ffff) { 2141 this._err(ERR.characterReferenceOutsideUnicodeRange); 2142 this.charRefCode = $.REPLACEMENT_CHARACTER; 2143 } else if (unicode.isSurrogate(this.charRefCode)) { 2144 this._err(ERR.surrogateCharacterReference); 2145 this.charRefCode = $.REPLACEMENT_CHARACTER; 2146 } else if (unicode.isUndefinedCodePoint(this.charRefCode)) { 2147 this._err(ERR.noncharacterCharacterReference); 2148 } else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { 2149 this._err(ERR.controlCharacterReference); 2150 2151 const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS[this.charRefCode]; 2152 2153 if (replacement) { 2154 this.charRefCode = replacement; 2155 } 2156 } 2157 2158 this.tempBuff = [this.charRefCode]; 2159 2160 this._flushCodePointsConsumedAsCharacterReference(); 2161 this._reconsumeInState(this.returnState); 2162 } 2163} 2164 2165//Token types 2166Tokenizer.CHARACTER_TOKEN = 'CHARACTER_TOKEN'; 2167Tokenizer.NULL_CHARACTER_TOKEN = 'NULL_CHARACTER_TOKEN'; 2168Tokenizer.WHITESPACE_CHARACTER_TOKEN = 'WHITESPACE_CHARACTER_TOKEN'; 2169Tokenizer.START_TAG_TOKEN = 'START_TAG_TOKEN'; 2170Tokenizer.END_TAG_TOKEN = 'END_TAG_TOKEN'; 2171Tokenizer.COMMENT_TOKEN = 'COMMENT_TOKEN'; 2172Tokenizer.DOCTYPE_TOKEN = 'DOCTYPE_TOKEN'; 2173Tokenizer.EOF_TOKEN = 'EOF_TOKEN'; 2174Tokenizer.HIBERNATION_TOKEN = 'HIBERNATION_TOKEN'; 2175 2176//Tokenizer initial states for different modes 2177Tokenizer.MODE = { 2178 DATA: DATA_STATE, 2179 RCDATA: RCDATA_STATE, 2180 RAWTEXT: RAWTEXT_STATE, 2181 SCRIPT_DATA: SCRIPT_DATA_STATE, 2182 PLAINTEXT: PLAINTEXT_STATE 2183}; 2184 2185//Static 2186Tokenizer.getTokenAttr = function(token, attrName) { 2187 for (let i = token.attrs.length - 1; i >= 0; i--) { 2188 if (token.attrs[i].name === attrName) { 2189 return token.attrs[i].value; 2190 } 2191 } 2192 2193 return null; 2194}; 2195 2196module.exports = Tokenizer; 2197