1import { Preprocessor } from './preprocessor.js'; 2import { 3 CODE_POINTS as $, 4 SEQUENCES as $$, 5 REPLACEMENT_CHARACTER, 6 isSurrogate, 7 isUndefinedCodePoint, 8 isControlCodePoint, 9} from '../common/unicode.js'; 10import { 11 TokenType, 12 getTokenAttr, 13 type Token, 14 type CharacterToken, 15 type DoctypeToken, 16 type TagToken, 17 type EOFToken, 18 type CommentToken, 19 type Attribute, 20 type Location, 21} from '../common/token.js'; 22import { htmlDecodeTree, BinTrieFlags, determineBranch } from 'entities/lib/decode.js'; 23import { ERR, type ParserErrorHandler } from '../common/error-codes.js'; 24import { TAG_ID, getTagID } from '../common/html.js'; 25 26//C1 Unicode control character reference replacements 27const C1_CONTROLS_REFERENCE_REPLACEMENTS = new Map([ 28 [0x80, 0x20_ac], 29 [0x82, 0x20_1a], 30 [0x83, 0x01_92], 31 [0x84, 0x20_1e], 32 [0x85, 0x20_26], 33 [0x86, 0x20_20], 34 [0x87, 0x20_21], 35 [0x88, 0x02_c6], 36 [0x89, 0x20_30], 37 [0x8a, 0x01_60], 38 [0x8b, 0x20_39], 39 [0x8c, 0x01_52], 40 [0x8e, 0x01_7d], 41 [0x91, 0x20_18], 42 [0x92, 0x20_19], 43 [0x93, 0x20_1c], 44 [0x94, 0x20_1d], 45 [0x95, 0x20_22], 46 [0x96, 0x20_13], 47 [0x97, 0x20_14], 48 [0x98, 0x02_dc], 49 [0x99, 0x21_22], 50 [0x9a, 0x01_61], 51 [0x9b, 0x20_3a], 52 [0x9c, 0x01_53], 53 [0x9e, 0x01_7e], 54 [0x9f, 0x01_78], 55]); 56 57//States 58const enum State { 59 DATA, 60 RCDATA, 61 RAWTEXT, 62 SCRIPT_DATA, 63 PLAINTEXT, 64 TAG_OPEN, 65 END_TAG_OPEN, 66 TAG_NAME, 67 RCDATA_LESS_THAN_SIGN, 68 RCDATA_END_TAG_OPEN, 69 RCDATA_END_TAG_NAME, 70 RAWTEXT_LESS_THAN_SIGN, 71 RAWTEXT_END_TAG_OPEN, 72 RAWTEXT_END_TAG_NAME, 73 SCRIPT_DATA_LESS_THAN_SIGN, 74 SCRIPT_DATA_END_TAG_OPEN, 75 SCRIPT_DATA_END_TAG_NAME, 76 SCRIPT_DATA_ESCAPE_START, 77 SCRIPT_DATA_ESCAPE_START_DASH, 78 SCRIPT_DATA_ESCAPED, 79 SCRIPT_DATA_ESCAPED_DASH, 80 SCRIPT_DATA_ESCAPED_DASH_DASH, 81 SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, 82 SCRIPT_DATA_ESCAPED_END_TAG_OPEN, 83 SCRIPT_DATA_ESCAPED_END_TAG_NAME, 84 SCRIPT_DATA_DOUBLE_ESCAPE_START, 85 SCRIPT_DATA_DOUBLE_ESCAPED, 86 SCRIPT_DATA_DOUBLE_ESCAPED_DASH, 87 SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, 88 SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, 89 SCRIPT_DATA_DOUBLE_ESCAPE_END, 90 BEFORE_ATTRIBUTE_NAME, 91 ATTRIBUTE_NAME, 92 AFTER_ATTRIBUTE_NAME, 93 BEFORE_ATTRIBUTE_VALUE, 94 ATTRIBUTE_VALUE_DOUBLE_QUOTED, 95 ATTRIBUTE_VALUE_SINGLE_QUOTED, 96 ATTRIBUTE_VALUE_UNQUOTED, 97 AFTER_ATTRIBUTE_VALUE_QUOTED, 98 SELF_CLOSING_START_TAG, 99 BOGUS_COMMENT, 100 MARKUP_DECLARATION_OPEN, 101 COMMENT_START, 102 COMMENT_START_DASH, 103 COMMENT, 104 COMMENT_LESS_THAN_SIGN, 105 COMMENT_LESS_THAN_SIGN_BANG, 106 COMMENT_LESS_THAN_SIGN_BANG_DASH, 107 COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, 108 COMMENT_END_DASH, 109 COMMENT_END, 110 COMMENT_END_BANG, 111 DOCTYPE, 112 BEFORE_DOCTYPE_NAME, 113 DOCTYPE_NAME, 114 AFTER_DOCTYPE_NAME, 115 AFTER_DOCTYPE_PUBLIC_KEYWORD, 116 BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, 117 DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, 118 DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, 119 AFTER_DOCTYPE_PUBLIC_IDENTIFIER, 120 BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, 121 AFTER_DOCTYPE_SYSTEM_KEYWORD, 122 BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, 123 DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, 124 DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, 125 AFTER_DOCTYPE_SYSTEM_IDENTIFIER, 126 BOGUS_DOCTYPE, 127 CDATA_SECTION, 128 CDATA_SECTION_BRACKET, 129 CDATA_SECTION_END, 130 CHARACTER_REFERENCE, 131 NAMED_CHARACTER_REFERENCE, 132 AMBIGUOUS_AMPERSAND, 133 NUMERIC_CHARACTER_REFERENCE, 134 HEXADEMICAL_CHARACTER_REFERENCE_START, 135 HEXADEMICAL_CHARACTER_REFERENCE, 136 DECIMAL_CHARACTER_REFERENCE, 137 NUMERIC_CHARACTER_REFERENCE_END, 138} 139 140//Tokenizer initial states for different modes 141export const TokenizerMode = { 142 DATA: State.DATA, 143 RCDATA: State.RCDATA, 144 RAWTEXT: State.RAWTEXT, 145 SCRIPT_DATA: State.SCRIPT_DATA, 146 PLAINTEXT: State.PLAINTEXT, 147 CDATA_SECTION: State.CDATA_SECTION, 148} as const; 149 150//Utils 151 152//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline 153//this functions if they will be situated in another module due to context switch. 154//Always perform inlining check before modifying this functions ('node --trace-inlining'). 155 156function isAsciiDigit(cp: number): boolean { 157 return cp >= $.DIGIT_0 && cp <= $.DIGIT_9; 158} 159 160function isAsciiUpper(cp: number): boolean { 161 return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_Z; 162} 163 164function isAsciiLower(cp: number): boolean { 165 return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_Z; 166} 167 168function isAsciiLetter(cp: number): boolean { 169 return isAsciiLower(cp) || isAsciiUpper(cp); 170} 171 172function isAsciiAlphaNumeric(cp: number): boolean { 173 return isAsciiLetter(cp) || isAsciiDigit(cp); 174} 175 176function isAsciiUpperHexDigit(cp: number): boolean { 177 return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F; 178} 179 180function isAsciiLowerHexDigit(cp: number): boolean { 181 return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F; 182} 183 184function isAsciiHexDigit(cp: number): boolean { 185 return isAsciiDigit(cp) || isAsciiUpperHexDigit(cp) || isAsciiLowerHexDigit(cp); 186} 187 188function toAsciiLower(cp: number): number { 189 return cp + 0x00_20; 190} 191 192function isWhitespace(cp: number): boolean { 193 return cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED; 194} 195 196function isEntityInAttributeInvalidEnd(nextCp: number): boolean { 197 return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp); 198} 199 200function isScriptDataDoubleEscapeSequenceEnd(cp: number): boolean { 201 return isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN; 202} 203 204const componentValidator = { isSupportedSelfClosing: () => false }; 205 206interface Validator { 207 isSupportedSelfClosing(tagName: string): boolean; 208} 209 210interface CompileResult { 211 jsonTemplate: {}, 212 deps: [], 213 log: { 214 line: number, 215 column: number, 216 reason: string 217 }[] 218} 219 220interface NodeInfo { 221 tn: string, 222 sc: boolean, 223 pos: string 224} 225 226export interface TokenizerOptions { 227 componentValidator?: Validator; 228 compileResult?: CompileResult; 229 sourceCodeLocationInfo?: boolean; 230} 231 232export interface TokenHandler { 233 onComment(token: CommentToken): void; 234 onDoctype(token: DoctypeToken): void; 235 onStartTag(token: TagToken): void; 236 onEndTag(token: TagToken): void; 237 onEof(token: EOFToken): void; 238 onCharacter(token: CharacterToken): void; 239 onNullCharacter(token: CharacterToken): void; 240 onWhitespaceCharacter(token: CharacterToken): void; 241 242 onParseError?: ParserErrorHandler | null; 243} 244 245//Tokenizer 246export class Tokenizer { 247 public preprocessor: Preprocessor; 248 249 private paused = false; 250 /** Ensures that the parsing loop isn't run multiple times at once. */ 251 private inLoop = false; 252 253 /** 254 * Indicates that the current adjusted node exists, is not an element in the HTML namespace, 255 * and that it is not an integration point for either MathML or HTML. 256 * 257 * @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction} 258 */ 259 public inForeignNode = false; 260 public lastStartTagName = ''; 261 public active = false; 262 263 public nodeInfo: NodeInfo = { tn: '', sc: false, pos: '' }; 264 public validator: Validator = componentValidator; 265 public compileResult: CompileResult = { jsonTemplate: {}, deps: [], log: [] }; 266 267 public state = State.DATA; 268 private returnState = State.DATA; 269 270 private charRefCode = -1; 271 272 private consumedAfterSnapshot = -1; 273 274 private currentLocation: Location | null; 275 private currentCharacterToken: CharacterToken | null = null; 276 private currentToken: Token | null = null; 277 private currentAttr: Attribute = { name: '', value: '' }; 278 279 constructor(private options: TokenizerOptions, private handler: TokenHandler) { 280 this.preprocessor = new Preprocessor(handler); 281 this.currentLocation = this.getCurrentLocation(-1); 282 283 if(options.componentValidator){ 284 this.validator = options.componentValidator; 285 } 286 if(options.compileResult){ 287 this.compileResult = options.compileResult; 288 } 289 } 290 291 //Errors 292 private _err(code: ERR): void { 293 this.handler.onParseError?.(this.preprocessor.getError(code)); 294 } 295 296 // NOTE: `offset` may never run across line boundaries. 297 private getCurrentLocation(offset: number): Location | null { 298 if (!this.options.sourceCodeLocationInfo) { 299 return null; 300 } 301 302 return { 303 startLine: this.preprocessor.line, 304 startCol: this.preprocessor.col - offset, 305 startOffset: this.preprocessor.offset - offset, 306 endLine: -1, 307 endCol: -1, 308 endOffset: -1, 309 }; 310 } 311 312 private _runParsingLoop(): void { 313 if (this.inLoop) return; 314 315 this.inLoop = true; 316 317 while (this.active && !this.paused) { 318 this.consumedAfterSnapshot = 0; 319 320 const cp = this._consume(); 321 322 if (!this._ensureHibernation()) { 323 this._callState(cp); 324 } 325 } 326 327 this.inLoop = false; 328 } 329 330 //API 331 public pause(): void { 332 this.paused = true; 333 } 334 335 public resume(writeCallback?: () => void): void { 336 if (!this.paused) { 337 throw new Error('Parser was already resumed'); 338 } 339 340 this.paused = false; 341 342 // Necessary for synchronous resume. 343 if (this.inLoop) return; 344 345 this._runParsingLoop(); 346 347 if (!this.paused) { 348 writeCallback?.(); 349 } 350 } 351 352 public write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void { 353 this.active = true; 354 this.preprocessor.write(chunk, isLastChunk); 355 this._runParsingLoop(); 356 357 if (!this.paused) { 358 writeCallback?.(); 359 } 360 } 361 362 public insertHtmlAtCurrentPos(chunk: string): void { 363 this.active = true; 364 this.preprocessor.insertHtmlAtCurrentPos(chunk); 365 this._runParsingLoop(); 366 } 367 368 //Hibernation 369 private _ensureHibernation(): boolean { 370 if (this.preprocessor.endOfChunkHit) { 371 this._unconsume(this.consumedAfterSnapshot); 372 this.active = false; 373 374 return true; 375 } 376 377 return false; 378 } 379 380 //Consumption 381 private _consume(): number { 382 this.consumedAfterSnapshot++; 383 return this.preprocessor.advance(); 384 } 385 386 private _unconsume(count: number): void { 387 this.consumedAfterSnapshot -= count; 388 this.preprocessor.retreat(count); 389 } 390 391 private _reconsumeInState(state: State, cp: number): void { 392 this.state = state; 393 this._callState(cp); 394 } 395 396 private _advanceBy(count: number): void { 397 this.consumedAfterSnapshot += count; 398 for (let i = 0; i < count; i++) { 399 this.preprocessor.advance(); 400 } 401 } 402 403 private _consumeSequenceIfMatch(pattern: string, caseSensitive: boolean): boolean { 404 if (this.preprocessor.startsWith(pattern, caseSensitive)) { 405 // We will already have consumed one character before calling this method. 406 this._advanceBy(pattern.length - 1); 407 return true; 408 } 409 return false; 410 } 411 412 //Token creation 413 private _createStartTagToken(): void { 414 this.currentToken = { 415 type: TokenType.START_TAG, 416 tagName: '', 417 tagID: TAG_ID.UNKNOWN, 418 selfClosing: false, 419 ackSelfClosing: false, 420 attrs: [], 421 location: this.getCurrentLocation(1), 422 }; 423 } 424 425 private _createEndTagToken(): void { 426 this.currentToken = { 427 type: TokenType.END_TAG, 428 tagName: '', 429 tagID: TAG_ID.UNKNOWN, 430 selfClosing: false, 431 ackSelfClosing: false, 432 attrs: [], 433 location: this.getCurrentLocation(2), 434 }; 435 } 436 437 private _createCommentToken(offset: number): void { 438 this.currentToken = { 439 type: TokenType.COMMENT, 440 data: '', 441 location: this.getCurrentLocation(offset), 442 }; 443 } 444 445 private _createDoctypeToken(initialName: string | null): void { 446 this.currentToken = { 447 type: TokenType.DOCTYPE, 448 name: initialName, 449 forceQuirks: false, 450 publicId: null, 451 systemId: null, 452 location: this.currentLocation, 453 }; 454 } 455 456 private _createCharacterToken(type: CharacterToken['type'], chars: string): void { 457 this.currentCharacterToken = { 458 type, 459 chars, 460 location: this.currentLocation, 461 }; 462 } 463 464 //Tag attributes 465 private _createAttr(attrNameFirstCh: string): void { 466 this.currentAttr = { 467 name: attrNameFirstCh, 468 value: '', 469 }; 470 this.currentLocation = this.getCurrentLocation(0); 471 } 472 473 private _leaveAttrName(): void { 474 const token = this.currentToken as TagToken; 475 476 if (getTokenAttr(token, this.currentAttr.name) === null) { 477 token.attrs.push(this.currentAttr); 478 479 if (token.location && this.currentLocation) { 480 const attrLocations = (token.location.attrs ??= Object.create(null)); 481 attrLocations[this.currentAttr.name] = this.currentLocation; 482 483 // Set end location 484 this._leaveAttrValue(); 485 } 486 } else { 487 this._err(ERR.duplicateAttribute); 488 } 489 } 490 491 private _leaveAttrValue(): void { 492 if (this.currentLocation) { 493 this.currentLocation.endLine = this.preprocessor.line; 494 this.currentLocation.endCol = this.preprocessor.col; 495 this.currentLocation.endOffset = this.preprocessor.offset; 496 } 497 } 498 499 //Token emission 500 private prepareToken(ct: Token): void { 501 this._emitCurrentCharacterToken(ct.location); 502 this.currentToken = null; 503 504 if (ct.location) { 505 ct.location.endLine = this.preprocessor.line; 506 ct.location.endCol = this.preprocessor.col + 1; 507 ct.location.endOffset = this.preprocessor.offset + 1; 508 } 509 510 this.currentLocation = this.getCurrentLocation(-1); 511 } 512 513 private emitCurrentTagToken(): void { 514 const ct = this.currentToken as TagToken; 515 checkselfClosingNode(this, ct); 516 this.prepareToken(ct); 517 518 ct.tagID = getTagID(ct.tagName); 519 520 if (ct.type === TokenType.START_TAG) { 521 this.lastStartTagName = ct.tagName; 522 this.handler.onStartTag(ct); 523 } else { 524 if (ct.attrs.length > 0) { 525 this._err(ERR.endTagWithAttributes); 526 } 527 528 if (ct.selfClosing) { 529 this._err(ERR.endTagWithTrailingSolidus); 530 } 531 532 this.handler.onEndTag(ct); 533 } 534 535 this.preprocessor.dropParsedChunk(); 536 } 537 538 private emitCurrentComment(ct: CommentToken): void { 539 this.prepareToken(ct); 540 this.handler.onComment(ct); 541 542 this.preprocessor.dropParsedChunk(); 543 } 544 545 private emitCurrentDoctype(ct: DoctypeToken): void { 546 this.prepareToken(ct); 547 this.handler.onDoctype(ct); 548 549 this.preprocessor.dropParsedChunk(); 550 } 551 552 private _emitCurrentCharacterToken(nextLocation: Location | null): void { 553 if (this.currentCharacterToken) { 554 //NOTE: if we have a pending character token, make it's end location equal to the 555 //current token's start location. 556 if (nextLocation && this.currentCharacterToken.location) { 557 this.currentCharacterToken.location.endLine = nextLocation.startLine; 558 this.currentCharacterToken.location.endCol = nextLocation.startCol; 559 this.currentCharacterToken.location.endOffset = nextLocation.startOffset; 560 } 561 562 switch (this.currentCharacterToken.type) { 563 case TokenType.CHARACTER: { 564 this.handler.onCharacter(this.currentCharacterToken); 565 break; 566 } 567 case TokenType.NULL_CHARACTER: { 568 this.handler.onNullCharacter(this.currentCharacterToken); 569 break; 570 } 571 case TokenType.WHITESPACE_CHARACTER: { 572 this.handler.onWhitespaceCharacter(this.currentCharacterToken); 573 break; 574 } 575 } 576 577 this.currentCharacterToken = null; 578 } 579 } 580 581 private _emitEOFToken(): void { 582 const location = this.getCurrentLocation(0); 583 584 if (location) { 585 location.endLine = location.startLine; 586 location.endCol = location.startCol; 587 location.endOffset = location.startOffset; 588 } 589 590 this._emitCurrentCharacterToken(location); 591 this.handler.onEof({ type: TokenType.EOF, location }); 592 this.active = false; 593 } 594 595 //Characters emission 596 597 //OPTIMIZATION: specification uses only one type of character tokens (one token per character). 598 //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters. 599 //If we have a sequence of characters that belong to the same group, the parser can process it 600 //as a single solid character token. 601 //So, there are 3 types of character tokens in parse5: 602 //1)TokenType.NULL_CHARACTER - \u0000-character sequences (e.g. '\u0000\u0000\u0000') 603 //2)TokenType.WHITESPACE_CHARACTER - any whitespace/new-line character sequences (e.g. '\n \r\t \f') 604 //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^') 605 private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void { 606 if (this.currentCharacterToken) { 607 if (this.currentCharacterToken.type !== type) { 608 this.currentLocation = this.getCurrentLocation(0); 609 this._emitCurrentCharacterToken(this.currentLocation); 610 this.preprocessor.dropParsedChunk(); 611 } else { 612 this.currentCharacterToken.chars += ch; 613 return; 614 } 615 } 616 617 this._createCharacterToken(type, ch); 618 } 619 620 private _emitCodePoint(cp: number): void { 621 const type = isWhitespace(cp) 622 ? TokenType.WHITESPACE_CHARACTER 623 : cp === $.NULL 624 ? TokenType.NULL_CHARACTER 625 : TokenType.CHARACTER; 626 627 this._appendCharToCurrentCharacterToken(type, String.fromCodePoint(cp)); 628 } 629 630 //NOTE: used when we emit characters explicitly. 631 //This is always for non-whitespace and non-null characters, which allows us to avoid additional checks. 632 private _emitChars(ch: string): void { 633 this._appendCharToCurrentCharacterToken(TokenType.CHARACTER, ch); 634 } 635 636 // Character reference helpers 637 private _matchNamedCharacterReference(cp: number): number[] | null { 638 let result: number[] | null = null; 639 let excess = 0; 640 let withoutSemicolon = false; 641 642 for (let i = 0, current = htmlDecodeTree[0]; i >= 0; cp = this._consume()) { 643 i = determineBranch(htmlDecodeTree, current, i + 1, cp); 644 645 if (i < 0) break; 646 647 excess += 1; 648 649 current = htmlDecodeTree[i]; 650 651 const masked = current & BinTrieFlags.VALUE_LENGTH; 652 653 // If the branch is a value, store it and continue 654 if (masked) { 655 // The mask is the number of bytes of the value, including the current byte. 656 const valueLength = (masked >> 14) - 1; 657 658 // Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error. 659 // See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state 660 if ( 661 cp !== $.SEMICOLON && 662 this._isCharacterReferenceInAttribute() && 663 isEntityInAttributeInvalidEnd(this.preprocessor.peek(1)) 664 ) { 665 //NOTE: we don't flush all consumed code points here, and instead switch back to the original state after 666 //emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes. 667 result = [$.AMPERSAND]; 668 669 // Skip over the value. 670 i += valueLength; 671 } else { 672 // If this is a surrogate pair, consume the next two bytes. 673 result = 674 valueLength === 0 675 ? [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH] 676 : valueLength === 1 677 ? [htmlDecodeTree[++i]] 678 : [htmlDecodeTree[++i], htmlDecodeTree[++i]]; 679 excess = 0; 680 withoutSemicolon = cp !== $.SEMICOLON; 681 } 682 683 if (valueLength === 0) { 684 // If the value is zero-length, we're done. 685 this._consume(); 686 break; 687 } 688 } 689 } 690 691 this._unconsume(excess); 692 693 if (withoutSemicolon && !this.preprocessor.endOfChunkHit) { 694 this._err(ERR.missingSemicolonAfterCharacterReference); 695 } 696 697 // We want to emit the error above on the code point after the entity. 698 // We always consume one code point too many in the loop, and we wait to 699 // unconsume it until after the error is emitted. 700 this._unconsume(1); 701 702 return result; 703 } 704 705 private _isCharacterReferenceInAttribute(): boolean { 706 return ( 707 this.returnState === State.ATTRIBUTE_VALUE_DOUBLE_QUOTED || 708 this.returnState === State.ATTRIBUTE_VALUE_SINGLE_QUOTED || 709 this.returnState === State.ATTRIBUTE_VALUE_UNQUOTED 710 ); 711 } 712 713 private _flushCodePointConsumedAsCharacterReference(cp: number): void { 714 if (this._isCharacterReferenceInAttribute()) { 715 this.currentAttr.value += String.fromCodePoint(cp); 716 } else { 717 this._emitCodePoint(cp); 718 } 719 } 720 721 // Calling states this way turns out to be much faster than any other approach. 722 private _callState(cp: number): void { 723 switch (this.state) { 724 case State.DATA: { 725 this._stateData(cp); 726 break; 727 } 728 case State.RCDATA: { 729 this._stateRcdata(cp); 730 break; 731 } 732 case State.RAWTEXT: { 733 this._stateRawtext(cp); 734 break; 735 } 736 case State.SCRIPT_DATA: { 737 this._stateScriptData(cp); 738 break; 739 } 740 case State.PLAINTEXT: { 741 this._statePlaintext(cp); 742 break; 743 } 744 case State.TAG_OPEN: { 745 this._stateTagOpen(cp); 746 break; 747 } 748 case State.END_TAG_OPEN: { 749 this._stateEndTagOpen(cp); 750 break; 751 } 752 case State.TAG_NAME: { 753 this._stateTagName(cp); 754 break; 755 } 756 case State.RCDATA_LESS_THAN_SIGN: { 757 this._stateRcdataLessThanSign(cp); 758 break; 759 } 760 case State.RCDATA_END_TAG_OPEN: { 761 this._stateRcdataEndTagOpen(cp); 762 break; 763 } 764 case State.RCDATA_END_TAG_NAME: { 765 this._stateRcdataEndTagName(cp); 766 break; 767 } 768 case State.RAWTEXT_LESS_THAN_SIGN: { 769 this._stateRawtextLessThanSign(cp); 770 break; 771 } 772 case State.RAWTEXT_END_TAG_OPEN: { 773 this._stateRawtextEndTagOpen(cp); 774 break; 775 } 776 case State.RAWTEXT_END_TAG_NAME: { 777 this._stateRawtextEndTagName(cp); 778 break; 779 } 780 case State.SCRIPT_DATA_LESS_THAN_SIGN: { 781 this._stateScriptDataLessThanSign(cp); 782 break; 783 } 784 case State.SCRIPT_DATA_END_TAG_OPEN: { 785 this._stateScriptDataEndTagOpen(cp); 786 break; 787 } 788 case State.SCRIPT_DATA_END_TAG_NAME: { 789 this._stateScriptDataEndTagName(cp); 790 break; 791 } 792 case State.SCRIPT_DATA_ESCAPE_START: { 793 this._stateScriptDataEscapeStart(cp); 794 break; 795 } 796 case State.SCRIPT_DATA_ESCAPE_START_DASH: { 797 this._stateScriptDataEscapeStartDash(cp); 798 break; 799 } 800 case State.SCRIPT_DATA_ESCAPED: { 801 this._stateScriptDataEscaped(cp); 802 break; 803 } 804 case State.SCRIPT_DATA_ESCAPED_DASH: { 805 this._stateScriptDataEscapedDash(cp); 806 break; 807 } 808 case State.SCRIPT_DATA_ESCAPED_DASH_DASH: { 809 this._stateScriptDataEscapedDashDash(cp); 810 break; 811 } 812 case State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { 813 this._stateScriptDataEscapedLessThanSign(cp); 814 break; 815 } 816 case State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN: { 817 this._stateScriptDataEscapedEndTagOpen(cp); 818 break; 819 } 820 case State.SCRIPT_DATA_ESCAPED_END_TAG_NAME: { 821 this._stateScriptDataEscapedEndTagName(cp); 822 break; 823 } 824 case State.SCRIPT_DATA_DOUBLE_ESCAPE_START: { 825 this._stateScriptDataDoubleEscapeStart(cp); 826 break; 827 } 828 case State.SCRIPT_DATA_DOUBLE_ESCAPED: { 829 this._stateScriptDataDoubleEscaped(cp); 830 break; 831 } 832 case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH: { 833 this._stateScriptDataDoubleEscapedDash(cp); 834 break; 835 } 836 case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: { 837 this._stateScriptDataDoubleEscapedDashDash(cp); 838 break; 839 } 840 case State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: { 841 this._stateScriptDataDoubleEscapedLessThanSign(cp); 842 break; 843 } 844 case State.SCRIPT_DATA_DOUBLE_ESCAPE_END: { 845 this._stateScriptDataDoubleEscapeEnd(cp); 846 break; 847 } 848 case State.BEFORE_ATTRIBUTE_NAME: { 849 this._stateBeforeAttributeName(cp); 850 break; 851 } 852 case State.ATTRIBUTE_NAME: { 853 this._stateAttributeName(cp); 854 break; 855 } 856 case State.AFTER_ATTRIBUTE_NAME: { 857 this._stateAfterAttributeName(cp); 858 break; 859 } 860 case State.BEFORE_ATTRIBUTE_VALUE: { 861 this._stateBeforeAttributeValue(cp); 862 break; 863 } 864 case State.ATTRIBUTE_VALUE_DOUBLE_QUOTED: { 865 this._stateAttributeValueDoubleQuoted(cp); 866 break; 867 } 868 case State.ATTRIBUTE_VALUE_SINGLE_QUOTED: { 869 this._stateAttributeValueSingleQuoted(cp); 870 break; 871 } 872 case State.ATTRIBUTE_VALUE_UNQUOTED: { 873 this._stateAttributeValueUnquoted(cp); 874 break; 875 } 876 case State.AFTER_ATTRIBUTE_VALUE_QUOTED: { 877 this._stateAfterAttributeValueQuoted(cp); 878 break; 879 } 880 case State.SELF_CLOSING_START_TAG: { 881 this._stateSelfClosingStartTag(cp); 882 break; 883 } 884 case State.BOGUS_COMMENT: { 885 this._stateBogusComment(cp); 886 break; 887 } 888 case State.MARKUP_DECLARATION_OPEN: { 889 this._stateMarkupDeclarationOpen(cp); 890 break; 891 } 892 case State.COMMENT_START: { 893 this._stateCommentStart(cp); 894 break; 895 } 896 case State.COMMENT_START_DASH: { 897 this._stateCommentStartDash(cp); 898 break; 899 } 900 case State.COMMENT: { 901 this._stateComment(cp); 902 break; 903 } 904 case State.COMMENT_LESS_THAN_SIGN: { 905 this._stateCommentLessThanSign(cp); 906 break; 907 } 908 case State.COMMENT_LESS_THAN_SIGN_BANG: { 909 this._stateCommentLessThanSignBang(cp); 910 break; 911 } 912 case State.COMMENT_LESS_THAN_SIGN_BANG_DASH: { 913 this._stateCommentLessThanSignBangDash(cp); 914 break; 915 } 916 case State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: { 917 this._stateCommentLessThanSignBangDashDash(cp); 918 break; 919 } 920 case State.COMMENT_END_DASH: { 921 this._stateCommentEndDash(cp); 922 break; 923 } 924 case State.COMMENT_END: { 925 this._stateCommentEnd(cp); 926 break; 927 } 928 case State.COMMENT_END_BANG: { 929 this._stateCommentEndBang(cp); 930 break; 931 } 932 case State.DOCTYPE: { 933 this._stateDoctype(cp); 934 break; 935 } 936 case State.BEFORE_DOCTYPE_NAME: { 937 this._stateBeforeDoctypeName(cp); 938 break; 939 } 940 case State.DOCTYPE_NAME: { 941 this._stateDoctypeName(cp); 942 break; 943 } 944 case State.AFTER_DOCTYPE_NAME: { 945 this._stateAfterDoctypeName(cp); 946 break; 947 } 948 case State.AFTER_DOCTYPE_PUBLIC_KEYWORD: { 949 this._stateAfterDoctypePublicKeyword(cp); 950 break; 951 } 952 case State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: { 953 this._stateBeforeDoctypePublicIdentifier(cp); 954 break; 955 } 956 case State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: { 957 this._stateDoctypePublicIdentifierDoubleQuoted(cp); 958 break; 959 } 960 case State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: { 961 this._stateDoctypePublicIdentifierSingleQuoted(cp); 962 break; 963 } 964 case State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER: { 965 this._stateAfterDoctypePublicIdentifier(cp); 966 break; 967 } 968 case State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: { 969 this._stateBetweenDoctypePublicAndSystemIdentifiers(cp); 970 break; 971 } 972 case State.AFTER_DOCTYPE_SYSTEM_KEYWORD: { 973 this._stateAfterDoctypeSystemKeyword(cp); 974 break; 975 } 976 case State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: { 977 this._stateBeforeDoctypeSystemIdentifier(cp); 978 break; 979 } 980 case State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: { 981 this._stateDoctypeSystemIdentifierDoubleQuoted(cp); 982 break; 983 } 984 case State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: { 985 this._stateDoctypeSystemIdentifierSingleQuoted(cp); 986 break; 987 } 988 case State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER: { 989 this._stateAfterDoctypeSystemIdentifier(cp); 990 break; 991 } 992 case State.BOGUS_DOCTYPE: { 993 this._stateBogusDoctype(cp); 994 break; 995 } 996 case State.CDATA_SECTION: { 997 this._stateCdataSection(cp); 998 break; 999 } 1000 case State.CDATA_SECTION_BRACKET: { 1001 this._stateCdataSectionBracket(cp); 1002 break; 1003 } 1004 case State.CDATA_SECTION_END: { 1005 this._stateCdataSectionEnd(cp); 1006 break; 1007 } 1008 case State.CHARACTER_REFERENCE: { 1009 this._stateCharacterReference(cp); 1010 break; 1011 } 1012 case State.NAMED_CHARACTER_REFERENCE: { 1013 this._stateNamedCharacterReference(cp); 1014 break; 1015 } 1016 case State.AMBIGUOUS_AMPERSAND: { 1017 this._stateAmbiguousAmpersand(cp); 1018 break; 1019 } 1020 case State.NUMERIC_CHARACTER_REFERENCE: { 1021 this._stateNumericCharacterReference(cp); 1022 break; 1023 } 1024 case State.HEXADEMICAL_CHARACTER_REFERENCE_START: { 1025 this._stateHexademicalCharacterReferenceStart(cp); 1026 break; 1027 } 1028 case State.HEXADEMICAL_CHARACTER_REFERENCE: { 1029 this._stateHexademicalCharacterReference(cp); 1030 break; 1031 } 1032 case State.DECIMAL_CHARACTER_REFERENCE: { 1033 this._stateDecimalCharacterReference(cp); 1034 break; 1035 } 1036 case State.NUMERIC_CHARACTER_REFERENCE_END: { 1037 this._stateNumericCharacterReferenceEnd(cp); 1038 break; 1039 } 1040 default: { 1041 throw new Error('Unknown state'); 1042 } 1043 } 1044 } 1045 1046 // State machine 1047 1048 // Data state 1049 //------------------------------------------------------------------ 1050 private _stateData(cp: number): void { 1051 switch (cp) { 1052 case $.LESS_THAN_SIGN: { 1053 this.state = State.TAG_OPEN; 1054 break; 1055 } 1056 case $.AMPERSAND: { 1057 this.returnState = State.DATA; 1058 this.state = State.CHARACTER_REFERENCE; 1059 break; 1060 } 1061 case $.NULL: { 1062 this._err(ERR.unexpectedNullCharacter); 1063 this._emitCodePoint(cp); 1064 break; 1065 } 1066 case $.EOF: { 1067 this._emitEOFToken(); 1068 break; 1069 } 1070 default: { 1071 this._emitCodePoint(cp); 1072 } 1073 } 1074 } 1075 1076 // RCDATA state 1077 //------------------------------------------------------------------ 1078 private _stateRcdata(cp: number): void { 1079 switch (cp) { 1080 case $.AMPERSAND: { 1081 this.returnState = State.RCDATA; 1082 this.state = State.CHARACTER_REFERENCE; 1083 break; 1084 } 1085 case $.LESS_THAN_SIGN: { 1086 this.state = State.RCDATA_LESS_THAN_SIGN; 1087 break; 1088 } 1089 case $.NULL: { 1090 this._err(ERR.unexpectedNullCharacter); 1091 this._emitChars(REPLACEMENT_CHARACTER); 1092 break; 1093 } 1094 case $.EOF: { 1095 this._emitEOFToken(); 1096 break; 1097 } 1098 default: { 1099 this._emitCodePoint(cp); 1100 } 1101 } 1102 } 1103 1104 // RAWTEXT state 1105 //------------------------------------------------------------------ 1106 private _stateRawtext(cp: number): void { 1107 switch (cp) { 1108 case $.LESS_THAN_SIGN: { 1109 this.state = State.RAWTEXT_LESS_THAN_SIGN; 1110 break; 1111 } 1112 case $.NULL: { 1113 this._err(ERR.unexpectedNullCharacter); 1114 this._emitChars(REPLACEMENT_CHARACTER); 1115 break; 1116 } 1117 case $.EOF: { 1118 this._emitEOFToken(); 1119 break; 1120 } 1121 default: { 1122 this._emitCodePoint(cp); 1123 } 1124 } 1125 } 1126 1127 // Script data state 1128 //------------------------------------------------------------------ 1129 private _stateScriptData(cp: number): void { 1130 switch (cp) { 1131 case $.LESS_THAN_SIGN: { 1132 this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; 1133 break; 1134 } 1135 case $.NULL: { 1136 this._err(ERR.unexpectedNullCharacter); 1137 this._emitChars(REPLACEMENT_CHARACTER); 1138 break; 1139 } 1140 case $.EOF: { 1141 this._emitEOFToken(); 1142 break; 1143 } 1144 default: { 1145 this._emitCodePoint(cp); 1146 } 1147 } 1148 } 1149 1150 // PLAINTEXT state 1151 //------------------------------------------------------------------ 1152 private _statePlaintext(cp: number): void { 1153 switch (cp) { 1154 case $.NULL: { 1155 this._err(ERR.unexpectedNullCharacter); 1156 this._emitChars(REPLACEMENT_CHARACTER); 1157 break; 1158 } 1159 case $.EOF: { 1160 this._emitEOFToken(); 1161 break; 1162 } 1163 default: { 1164 this._emitCodePoint(cp); 1165 } 1166 } 1167 } 1168 1169 // Tag open state 1170 //------------------------------------------------------------------ 1171 private _stateTagOpen(cp: number): void { 1172 if (isAsciiLetter(cp)) { 1173 this._createStartTagToken(); 1174 this.state = State.TAG_NAME; 1175 this._stateTagName(cp); 1176 } else 1177 switch (cp) { 1178 case $.EXCLAMATION_MARK: { 1179 this.state = State.MARKUP_DECLARATION_OPEN; 1180 break; 1181 } 1182 case $.SOLIDUS: { 1183 this.state = State.END_TAG_OPEN; 1184 break; 1185 } 1186 case $.QUESTION_MARK: { 1187 this._err(ERR.unexpectedQuestionMarkInsteadOfTagName); 1188 this._createCommentToken(1); 1189 this.state = State.BOGUS_COMMENT; 1190 this._stateBogusComment(cp); 1191 break; 1192 } 1193 case $.EOF: { 1194 this._err(ERR.eofBeforeTagName); 1195 this._emitChars('<'); 1196 this._emitEOFToken(); 1197 break; 1198 } 1199 default: { 1200 this._err(ERR.invalidFirstCharacterOfTagName); 1201 this._emitChars('<'); 1202 this.state = State.DATA; 1203 this._stateData(cp); 1204 } 1205 } 1206 } 1207 1208 // End tag open state 1209 //------------------------------------------------------------------ 1210 private _stateEndTagOpen(cp: number): void { 1211 if (isAsciiLetter(cp)) { 1212 this._createEndTagToken(); 1213 this.state = State.TAG_NAME; 1214 this._stateTagName(cp); 1215 } else 1216 switch (cp) { 1217 case $.GREATER_THAN_SIGN: { 1218 this._err(ERR.missingEndTagName); 1219 this.state = State.DATA; 1220 break; 1221 } 1222 case $.EOF: { 1223 this._err(ERR.eofBeforeTagName); 1224 this._emitChars('</'); 1225 this._emitEOFToken(); 1226 break; 1227 } 1228 default: { 1229 this._err(ERR.invalidFirstCharacterOfTagName); 1230 this._createCommentToken(2); 1231 this.state = State.BOGUS_COMMENT; 1232 this._stateBogusComment(cp); 1233 } 1234 } 1235 } 1236 1237 // Tag name state 1238 //------------------------------------------------------------------ 1239 private _stateTagName(cp: number): void { 1240 const token = this.currentToken as TagToken; 1241 1242 switch (cp) { 1243 case $.SPACE: 1244 case $.LINE_FEED: 1245 case $.TABULATION: 1246 case $.FORM_FEED: { 1247 this.state = State.BEFORE_ATTRIBUTE_NAME; 1248 break; 1249 } 1250 case $.SOLIDUS: { 1251 this.state = State.SELF_CLOSING_START_TAG; 1252 break; 1253 } 1254 case $.GREATER_THAN_SIGN: { 1255 this.state = State.DATA; 1256 this.emitCurrentTagToken(); 1257 break; 1258 } 1259 case $.NULL: { 1260 this._err(ERR.unexpectedNullCharacter); 1261 token.tagName += REPLACEMENT_CHARACTER; 1262 break; 1263 } 1264 case $.EOF: { 1265 this._err(ERR.eofInTag); 1266 this._emitEOFToken(); 1267 break; 1268 } 1269 default: { 1270 token.tagName += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp); 1271 } 1272 } 1273 } 1274 1275 // RCDATA less-than sign state 1276 //------------------------------------------------------------------ 1277 private _stateRcdataLessThanSign(cp: number): void { 1278 if (cp === $.SOLIDUS) { 1279 this.state = State.RCDATA_END_TAG_OPEN; 1280 } else { 1281 this._emitChars('<'); 1282 this.state = State.RCDATA; 1283 this._stateRcdata(cp); 1284 } 1285 } 1286 1287 // RCDATA end tag open state 1288 //------------------------------------------------------------------ 1289 private _stateRcdataEndTagOpen(cp: number): void { 1290 if (isAsciiLetter(cp)) { 1291 this.state = State.RCDATA_END_TAG_NAME; 1292 this._stateRcdataEndTagName(cp); 1293 } else { 1294 this._emitChars('</'); 1295 this.state = State.RCDATA; 1296 this._stateRcdata(cp); 1297 } 1298 } 1299 1300 private handleSpecialEndTag(_cp: number): boolean { 1301 if (!this.preprocessor.startsWith(this.lastStartTagName, false)) { 1302 return !this._ensureHibernation(); 1303 } 1304 1305 this._createEndTagToken(); 1306 const token = this.currentToken as TagToken; 1307 token.tagName = this.lastStartTagName; 1308 1309 const cp = this.preprocessor.peek(this.lastStartTagName.length); 1310 1311 switch (cp) { 1312 case $.SPACE: 1313 case $.LINE_FEED: 1314 case $.TABULATION: 1315 case $.FORM_FEED: { 1316 this._advanceBy(this.lastStartTagName.length); 1317 this.state = State.BEFORE_ATTRIBUTE_NAME; 1318 return false; 1319 } 1320 case $.SOLIDUS: { 1321 this._advanceBy(this.lastStartTagName.length); 1322 this.state = State.SELF_CLOSING_START_TAG; 1323 return false; 1324 } 1325 case $.GREATER_THAN_SIGN: { 1326 this._advanceBy(this.lastStartTagName.length); 1327 this.emitCurrentTagToken(); 1328 this.state = State.DATA; 1329 return false; 1330 } 1331 default: { 1332 return !this._ensureHibernation(); 1333 } 1334 } 1335 } 1336 1337 // RCDATA end tag name state 1338 //------------------------------------------------------------------ 1339 private _stateRcdataEndTagName(cp: number): void { 1340 if (this.handleSpecialEndTag(cp)) { 1341 this._emitChars('</'); 1342 this.state = State.RCDATA; 1343 this._stateRcdata(cp); 1344 } 1345 } 1346 1347 // RAWTEXT less-than sign state 1348 //------------------------------------------------------------------ 1349 private _stateRawtextLessThanSign(cp: number): void { 1350 if (cp === $.SOLIDUS) { 1351 this.state = State.RAWTEXT_END_TAG_OPEN; 1352 } else { 1353 this._emitChars('<'); 1354 this.state = State.RAWTEXT; 1355 this._stateRawtext(cp); 1356 } 1357 } 1358 1359 // RAWTEXT end tag open state 1360 //------------------------------------------------------------------ 1361 private _stateRawtextEndTagOpen(cp: number): void { 1362 if (isAsciiLetter(cp)) { 1363 this.state = State.RAWTEXT_END_TAG_NAME; 1364 this._stateRawtextEndTagName(cp); 1365 } else { 1366 this._emitChars('</'); 1367 this.state = State.RAWTEXT; 1368 this._stateRawtext(cp); 1369 } 1370 } 1371 1372 // RAWTEXT end tag name state 1373 //------------------------------------------------------------------ 1374 private _stateRawtextEndTagName(cp: number): void { 1375 if (this.handleSpecialEndTag(cp)) { 1376 this._emitChars('</'); 1377 this.state = State.RAWTEXT; 1378 this._stateRawtext(cp); 1379 } 1380 } 1381 1382 // Script data less-than sign state 1383 //------------------------------------------------------------------ 1384 private _stateScriptDataLessThanSign(cp: number): void { 1385 switch (cp) { 1386 case $.SOLIDUS: { 1387 this.state = State.SCRIPT_DATA_END_TAG_OPEN; 1388 break; 1389 } 1390 case $.EXCLAMATION_MARK: { 1391 this.state = State.SCRIPT_DATA_ESCAPE_START; 1392 this._emitChars('<!'); 1393 break; 1394 } 1395 default: { 1396 this._emitChars('<'); 1397 this.state = State.SCRIPT_DATA; 1398 this._stateScriptData(cp); 1399 } 1400 } 1401 } 1402 1403 // Script data end tag open state 1404 //------------------------------------------------------------------ 1405 private _stateScriptDataEndTagOpen(cp: number): void { 1406 if (isAsciiLetter(cp)) { 1407 this.state = State.SCRIPT_DATA_END_TAG_NAME; 1408 this._stateScriptDataEndTagName(cp); 1409 } else { 1410 this._emitChars('</'); 1411 this.state = State.SCRIPT_DATA; 1412 this._stateScriptData(cp); 1413 } 1414 } 1415 1416 // Script data end tag name state 1417 //------------------------------------------------------------------ 1418 private _stateScriptDataEndTagName(cp: number): void { 1419 if (this.handleSpecialEndTag(cp)) { 1420 this._emitChars('</'); 1421 this.state = State.SCRIPT_DATA; 1422 this._stateScriptData(cp); 1423 } 1424 } 1425 1426 // Script data escape start state 1427 //------------------------------------------------------------------ 1428 private _stateScriptDataEscapeStart(cp: number): void { 1429 if (cp === $.HYPHEN_MINUS) { 1430 this.state = State.SCRIPT_DATA_ESCAPE_START_DASH; 1431 this._emitChars('-'); 1432 } else { 1433 this.state = State.SCRIPT_DATA; 1434 this._stateScriptData(cp); 1435 } 1436 } 1437 1438 // Script data escape start dash state 1439 //------------------------------------------------------------------ 1440 private _stateScriptDataEscapeStartDash(cp: number): void { 1441 if (cp === $.HYPHEN_MINUS) { 1442 this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH; 1443 this._emitChars('-'); 1444 } else { 1445 this.state = State.SCRIPT_DATA; 1446 this._stateScriptData(cp); 1447 } 1448 } 1449 1450 // Script data escaped state 1451 //------------------------------------------------------------------ 1452 private _stateScriptDataEscaped(cp: number): void { 1453 switch (cp) { 1454 case $.HYPHEN_MINUS: { 1455 this.state = State.SCRIPT_DATA_ESCAPED_DASH; 1456 this._emitChars('-'); 1457 break; 1458 } 1459 case $.LESS_THAN_SIGN: { 1460 this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; 1461 break; 1462 } 1463 case $.NULL: { 1464 this._err(ERR.unexpectedNullCharacter); 1465 this._emitChars(REPLACEMENT_CHARACTER); 1466 break; 1467 } 1468 case $.EOF: { 1469 this._err(ERR.eofInScriptHtmlCommentLikeText); 1470 this._emitEOFToken(); 1471 break; 1472 } 1473 default: { 1474 this._emitCodePoint(cp); 1475 } 1476 } 1477 } 1478 1479 // Script data escaped dash state 1480 //------------------------------------------------------------------ 1481 private _stateScriptDataEscapedDash(cp: number): void { 1482 switch (cp) { 1483 case $.HYPHEN_MINUS: { 1484 this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH; 1485 this._emitChars('-'); 1486 break; 1487 } 1488 case $.LESS_THAN_SIGN: { 1489 this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; 1490 break; 1491 } 1492 case $.NULL: { 1493 this._err(ERR.unexpectedNullCharacter); 1494 this.state = State.SCRIPT_DATA_ESCAPED; 1495 this._emitChars(REPLACEMENT_CHARACTER); 1496 break; 1497 } 1498 case $.EOF: { 1499 this._err(ERR.eofInScriptHtmlCommentLikeText); 1500 this._emitEOFToken(); 1501 break; 1502 } 1503 default: { 1504 this.state = State.SCRIPT_DATA_ESCAPED; 1505 this._emitCodePoint(cp); 1506 } 1507 } 1508 } 1509 1510 // Script data escaped dash dash state 1511 //------------------------------------------------------------------ 1512 private _stateScriptDataEscapedDashDash(cp: number): void { 1513 switch (cp) { 1514 case $.HYPHEN_MINUS: { 1515 this._emitChars('-'); 1516 break; 1517 } 1518 case $.LESS_THAN_SIGN: { 1519 this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; 1520 break; 1521 } 1522 case $.GREATER_THAN_SIGN: { 1523 this.state = State.SCRIPT_DATA; 1524 this._emitChars('>'); 1525 break; 1526 } 1527 case $.NULL: { 1528 this._err(ERR.unexpectedNullCharacter); 1529 this.state = State.SCRIPT_DATA_ESCAPED; 1530 this._emitChars(REPLACEMENT_CHARACTER); 1531 break; 1532 } 1533 case $.EOF: { 1534 this._err(ERR.eofInScriptHtmlCommentLikeText); 1535 this._emitEOFToken(); 1536 break; 1537 } 1538 default: { 1539 this.state = State.SCRIPT_DATA_ESCAPED; 1540 this._emitCodePoint(cp); 1541 } 1542 } 1543 } 1544 1545 // Script data escaped less-than sign state 1546 //------------------------------------------------------------------ 1547 private _stateScriptDataEscapedLessThanSign(cp: number): void { 1548 if (cp === $.SOLIDUS) { 1549 this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN; 1550 } else if (isAsciiLetter(cp)) { 1551 this._emitChars('<'); 1552 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_START; 1553 this._stateScriptDataDoubleEscapeStart(cp); 1554 } else { 1555 this._emitChars('<'); 1556 this.state = State.SCRIPT_DATA_ESCAPED; 1557 this._stateScriptDataEscaped(cp); 1558 } 1559 } 1560 1561 // Script data escaped end tag open state 1562 //------------------------------------------------------------------ 1563 private _stateScriptDataEscapedEndTagOpen(cp: number): void { 1564 if (isAsciiLetter(cp)) { 1565 this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_NAME; 1566 this._stateScriptDataEscapedEndTagName(cp); 1567 } else { 1568 this._emitChars('</'); 1569 this.state = State.SCRIPT_DATA_ESCAPED; 1570 this._stateScriptDataEscaped(cp); 1571 } 1572 } 1573 1574 // Script data escaped end tag name state 1575 //------------------------------------------------------------------ 1576 private _stateScriptDataEscapedEndTagName(cp: number): void { 1577 if (this.handleSpecialEndTag(cp)) { 1578 this._emitChars('</'); 1579 this.state = State.SCRIPT_DATA_ESCAPED; 1580 this._stateScriptDataEscaped(cp); 1581 } 1582 } 1583 1584 // Script data double escape start state 1585 //------------------------------------------------------------------ 1586 private _stateScriptDataDoubleEscapeStart(cp: number): void { 1587 if ( 1588 this.preprocessor.startsWith($$.SCRIPT, false) && 1589 isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek($$.SCRIPT.length)) 1590 ) { 1591 this._emitCodePoint(cp); 1592 for (let i = 0; i < $$.SCRIPT.length; i++) { 1593 this._emitCodePoint(this._consume()); 1594 } 1595 1596 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1597 } else if (!this._ensureHibernation()) { 1598 this.state = State.SCRIPT_DATA_ESCAPED; 1599 this._stateScriptDataEscaped(cp); 1600 } 1601 } 1602 1603 // Script data double escaped state 1604 //------------------------------------------------------------------ 1605 private _stateScriptDataDoubleEscaped(cp: number): void { 1606 switch (cp) { 1607 case $.HYPHEN_MINUS: { 1608 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH; 1609 this._emitChars('-'); 1610 break; 1611 } 1612 case $.LESS_THAN_SIGN: { 1613 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; 1614 this._emitChars('<'); 1615 break; 1616 } 1617 case $.NULL: { 1618 this._err(ERR.unexpectedNullCharacter); 1619 this._emitChars(REPLACEMENT_CHARACTER); 1620 break; 1621 } 1622 case $.EOF: { 1623 this._err(ERR.eofInScriptHtmlCommentLikeText); 1624 this._emitEOFToken(); 1625 break; 1626 } 1627 default: { 1628 this._emitCodePoint(cp); 1629 } 1630 } 1631 } 1632 1633 // Script data double escaped dash state 1634 //------------------------------------------------------------------ 1635 private _stateScriptDataDoubleEscapedDash(cp: number): void { 1636 switch (cp) { 1637 case $.HYPHEN_MINUS: { 1638 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; 1639 this._emitChars('-'); 1640 break; 1641 } 1642 case $.LESS_THAN_SIGN: { 1643 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; 1644 this._emitChars('<'); 1645 break; 1646 } 1647 case $.NULL: { 1648 this._err(ERR.unexpectedNullCharacter); 1649 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1650 this._emitChars(REPLACEMENT_CHARACTER); 1651 break; 1652 } 1653 case $.EOF: { 1654 this._err(ERR.eofInScriptHtmlCommentLikeText); 1655 this._emitEOFToken(); 1656 break; 1657 } 1658 default: { 1659 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1660 this._emitCodePoint(cp); 1661 } 1662 } 1663 } 1664 1665 // Script data double escaped dash dash state 1666 //------------------------------------------------------------------ 1667 private _stateScriptDataDoubleEscapedDashDash(cp: number): void { 1668 switch (cp) { 1669 case $.HYPHEN_MINUS: { 1670 this._emitChars('-'); 1671 break; 1672 } 1673 case $.LESS_THAN_SIGN: { 1674 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; 1675 this._emitChars('<'); 1676 break; 1677 } 1678 case $.GREATER_THAN_SIGN: { 1679 this.state = State.SCRIPT_DATA; 1680 this._emitChars('>'); 1681 break; 1682 } 1683 case $.NULL: { 1684 this._err(ERR.unexpectedNullCharacter); 1685 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1686 this._emitChars(REPLACEMENT_CHARACTER); 1687 break; 1688 } 1689 case $.EOF: { 1690 this._err(ERR.eofInScriptHtmlCommentLikeText); 1691 this._emitEOFToken(); 1692 break; 1693 } 1694 default: { 1695 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1696 this._emitCodePoint(cp); 1697 } 1698 } 1699 } 1700 1701 // Script data double escaped less-than sign state 1702 //------------------------------------------------------------------ 1703 private _stateScriptDataDoubleEscapedLessThanSign(cp: number): void { 1704 if (cp === $.SOLIDUS) { 1705 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_END; 1706 this._emitChars('/'); 1707 } else { 1708 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1709 this._stateScriptDataDoubleEscaped(cp); 1710 } 1711 } 1712 1713 // Script data double escape end state 1714 //------------------------------------------------------------------ 1715 private _stateScriptDataDoubleEscapeEnd(cp: number): void { 1716 if ( 1717 this.preprocessor.startsWith($$.SCRIPT, false) && 1718 isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek($$.SCRIPT.length)) 1719 ) { 1720 this._emitCodePoint(cp); 1721 for (let i = 0; i < $$.SCRIPT.length; i++) { 1722 this._emitCodePoint(this._consume()); 1723 } 1724 1725 this.state = State.SCRIPT_DATA_ESCAPED; 1726 } else if (!this._ensureHibernation()) { 1727 this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; 1728 this._stateScriptDataDoubleEscaped(cp); 1729 } 1730 } 1731 1732 // Before attribute name state 1733 //------------------------------------------------------------------ 1734 private _stateBeforeAttributeName(cp: number): void { 1735 switch (cp) { 1736 case $.SPACE: 1737 case $.LINE_FEED: 1738 case $.TABULATION: 1739 case $.FORM_FEED: { 1740 // Ignore whitespace 1741 break; 1742 } 1743 case $.SOLIDUS: 1744 case $.GREATER_THAN_SIGN: 1745 case $.EOF: { 1746 this.state = State.AFTER_ATTRIBUTE_NAME; 1747 this._stateAfterAttributeName(cp); 1748 break; 1749 } 1750 case $.EQUALS_SIGN: { 1751 this._err(ERR.unexpectedEqualsSignBeforeAttributeName); 1752 this._createAttr('='); 1753 this.state = State.ATTRIBUTE_NAME; 1754 break; 1755 } 1756 default: { 1757 this._createAttr(''); 1758 this.state = State.ATTRIBUTE_NAME; 1759 this._stateAttributeName(cp); 1760 } 1761 } 1762 } 1763 1764 // Attribute name state 1765 //------------------------------------------------------------------ 1766 private _stateAttributeName(cp: number): void { 1767 switch (cp) { 1768 case $.SPACE: 1769 case $.LINE_FEED: 1770 case $.TABULATION: 1771 case $.FORM_FEED: 1772 case $.SOLIDUS: 1773 case $.GREATER_THAN_SIGN: 1774 case $.EOF: { 1775 this._leaveAttrName(); 1776 this.state = State.AFTER_ATTRIBUTE_NAME; 1777 this._stateAfterAttributeName(cp); 1778 break; 1779 } 1780 case $.EQUALS_SIGN: { 1781 this._leaveAttrName(); 1782 this.state = State.BEFORE_ATTRIBUTE_VALUE; 1783 break; 1784 } 1785 case $.QUOTATION_MARK: 1786 case $.APOSTROPHE: 1787 case $.LESS_THAN_SIGN: { 1788 this._err(ERR.unexpectedCharacterInAttributeName); 1789 this.currentAttr.name += String.fromCodePoint(cp); 1790 break; 1791 } 1792 case $.NULL: { 1793 this._err(ERR.unexpectedNullCharacter); 1794 this.currentAttr.name += REPLACEMENT_CHARACTER; 1795 break; 1796 } 1797 default: { 1798 this.currentAttr.name += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp); 1799 } 1800 } 1801 } 1802 1803 // After attribute name state 1804 //------------------------------------------------------------------ 1805 private _stateAfterAttributeName(cp: number): void { 1806 switch (cp) { 1807 case $.SPACE: 1808 case $.LINE_FEED: 1809 case $.TABULATION: 1810 case $.FORM_FEED: { 1811 // Ignore whitespace 1812 break; 1813 } 1814 case $.SOLIDUS: { 1815 this.state = State.SELF_CLOSING_START_TAG; 1816 break; 1817 } 1818 case $.EQUALS_SIGN: { 1819 this.state = State.BEFORE_ATTRIBUTE_VALUE; 1820 break; 1821 } 1822 case $.GREATER_THAN_SIGN: { 1823 this.state = State.DATA; 1824 this.emitCurrentTagToken(); 1825 break; 1826 } 1827 case $.EOF: { 1828 this._err(ERR.eofInTag); 1829 this._emitEOFToken(); 1830 break; 1831 } 1832 default: { 1833 this._createAttr(''); 1834 this.state = State.ATTRIBUTE_NAME; 1835 this._stateAttributeName(cp); 1836 } 1837 } 1838 } 1839 1840 // Before attribute value state 1841 //------------------------------------------------------------------ 1842 private _stateBeforeAttributeValue(cp: number): void { 1843 switch (cp) { 1844 case $.SPACE: 1845 case $.LINE_FEED: 1846 case $.TABULATION: 1847 case $.FORM_FEED: { 1848 // Ignore whitespace 1849 break; 1850 } 1851 case $.QUOTATION_MARK: { 1852 this.state = State.ATTRIBUTE_VALUE_DOUBLE_QUOTED; 1853 break; 1854 } 1855 case $.APOSTROPHE: { 1856 this.state = State.ATTRIBUTE_VALUE_SINGLE_QUOTED; 1857 break; 1858 } 1859 case $.GREATER_THAN_SIGN: { 1860 this._err(ERR.missingAttributeValue); 1861 this.state = State.DATA; 1862 this.emitCurrentTagToken(); 1863 break; 1864 } 1865 default: { 1866 this.state = State.ATTRIBUTE_VALUE_UNQUOTED; 1867 this._stateAttributeValueUnquoted(cp); 1868 } 1869 } 1870 } 1871 1872 // Attribute value (double-quoted) state 1873 //------------------------------------------------------------------ 1874 private _stateAttributeValueDoubleQuoted(cp: number): void { 1875 switch (cp) { 1876 case $.QUOTATION_MARK: { 1877 this.state = State.AFTER_ATTRIBUTE_VALUE_QUOTED; 1878 break; 1879 } 1880 case $.AMPERSAND: { 1881 this.returnState = State.ATTRIBUTE_VALUE_DOUBLE_QUOTED; 1882 this.state = State.CHARACTER_REFERENCE; 1883 break; 1884 } 1885 case $.NULL: { 1886 this._err(ERR.unexpectedNullCharacter); 1887 this.currentAttr.value += REPLACEMENT_CHARACTER; 1888 break; 1889 } 1890 case $.EOF: { 1891 this._err(ERR.eofInTag); 1892 this._emitEOFToken(); 1893 break; 1894 } 1895 default: { 1896 this.currentAttr.value += String.fromCodePoint(cp); 1897 } 1898 } 1899 } 1900 1901 // Attribute value (single-quoted) state 1902 //------------------------------------------------------------------ 1903 private _stateAttributeValueSingleQuoted(cp: number): void { 1904 switch (cp) { 1905 case $.APOSTROPHE: { 1906 this.state = State.AFTER_ATTRIBUTE_VALUE_QUOTED; 1907 break; 1908 } 1909 case $.AMPERSAND: { 1910 this.returnState = State.ATTRIBUTE_VALUE_SINGLE_QUOTED; 1911 this.state = State.CHARACTER_REFERENCE; 1912 break; 1913 } 1914 case $.NULL: { 1915 this._err(ERR.unexpectedNullCharacter); 1916 this.currentAttr.value += REPLACEMENT_CHARACTER; 1917 break; 1918 } 1919 case $.EOF: { 1920 this._err(ERR.eofInTag); 1921 this._emitEOFToken(); 1922 break; 1923 } 1924 default: { 1925 this.currentAttr.value += String.fromCodePoint(cp); 1926 } 1927 } 1928 } 1929 1930 // Attribute value (unquoted) state 1931 //------------------------------------------------------------------ 1932 private _stateAttributeValueUnquoted(cp: number): void { 1933 switch (cp) { 1934 case $.SPACE: 1935 case $.LINE_FEED: 1936 case $.TABULATION: 1937 case $.FORM_FEED: { 1938 this._leaveAttrValue(); 1939 this.state = State.BEFORE_ATTRIBUTE_NAME; 1940 break; 1941 } 1942 case $.AMPERSAND: { 1943 this.returnState = State.ATTRIBUTE_VALUE_UNQUOTED; 1944 this.state = State.CHARACTER_REFERENCE; 1945 break; 1946 } 1947 case $.GREATER_THAN_SIGN: { 1948 this._leaveAttrValue(); 1949 this.state = State.DATA; 1950 this.emitCurrentTagToken(); 1951 break; 1952 } 1953 case $.NULL: { 1954 this._err(ERR.unexpectedNullCharacter); 1955 this.currentAttr.value += REPLACEMENT_CHARACTER; 1956 break; 1957 } 1958 case $.QUOTATION_MARK: 1959 case $.APOSTROPHE: 1960 case $.LESS_THAN_SIGN: 1961 case $.EQUALS_SIGN: 1962 case $.GRAVE_ACCENT: { 1963 this._err(ERR.unexpectedCharacterInUnquotedAttributeValue); 1964 this.currentAttr.value += String.fromCodePoint(cp); 1965 break; 1966 } 1967 case $.EOF: { 1968 this._err(ERR.eofInTag); 1969 this._emitEOFToken(); 1970 break; 1971 } 1972 default: { 1973 this.currentAttr.value += String.fromCodePoint(cp); 1974 } 1975 } 1976 } 1977 1978 // After attribute value (quoted) state 1979 //------------------------------------------------------------------ 1980 private _stateAfterAttributeValueQuoted(cp: number): void { 1981 switch (cp) { 1982 case $.SPACE: 1983 case $.LINE_FEED: 1984 case $.TABULATION: 1985 case $.FORM_FEED: { 1986 this._leaveAttrValue(); 1987 this.state = State.BEFORE_ATTRIBUTE_NAME; 1988 break; 1989 } 1990 case $.SOLIDUS: { 1991 this._leaveAttrValue(); 1992 this.state = State.SELF_CLOSING_START_TAG; 1993 break; 1994 } 1995 case $.GREATER_THAN_SIGN: { 1996 this._leaveAttrValue(); 1997 this.state = State.DATA; 1998 this.emitCurrentTagToken(); 1999 break; 2000 } 2001 case $.EOF: { 2002 this._err(ERR.eofInTag); 2003 this._emitEOFToken(); 2004 break; 2005 } 2006 default: { 2007 this._err(ERR.missingWhitespaceBetweenAttributes); 2008 this.state = State.BEFORE_ATTRIBUTE_NAME; 2009 this._stateBeforeAttributeName(cp); 2010 } 2011 } 2012 } 2013 2014 // Self-closing start tag state 2015 //------------------------------------------------------------------ 2016 private _stateSelfClosingStartTag(cp: number): void { 2017 switch (cp) { 2018 case $.GREATER_THAN_SIGN: { 2019 const token = this.currentToken as TagToken; 2020 token.selfClosing = true; 2021 this.state = State.DATA; 2022 this.emitCurrentTagToken(); 2023 break; 2024 } 2025 case $.EOF: { 2026 this._err(ERR.eofInTag); 2027 this._emitEOFToken(); 2028 break; 2029 } 2030 default: { 2031 this._err(ERR.unexpectedSolidusInTag); 2032 this.state = State.BEFORE_ATTRIBUTE_NAME; 2033 this._stateBeforeAttributeName(cp); 2034 } 2035 } 2036 } 2037 2038 // Bogus comment state 2039 //------------------------------------------------------------------ 2040 private _stateBogusComment(cp: number): void { 2041 const token = this.currentToken as CommentToken; 2042 2043 switch (cp) { 2044 case $.GREATER_THAN_SIGN: { 2045 this.state = State.DATA; 2046 this.emitCurrentComment(token); 2047 break; 2048 } 2049 case $.EOF: { 2050 this.emitCurrentComment(token); 2051 this._emitEOFToken(); 2052 break; 2053 } 2054 case $.NULL: { 2055 this._err(ERR.unexpectedNullCharacter); 2056 token.data += REPLACEMENT_CHARACTER; 2057 break; 2058 } 2059 default: { 2060 token.data += String.fromCodePoint(cp); 2061 } 2062 } 2063 } 2064 2065 // Markup declaration open state 2066 //------------------------------------------------------------------ 2067 private _stateMarkupDeclarationOpen(cp: number): void { 2068 if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) { 2069 this._createCommentToken($$.DASH_DASH.length + 1); 2070 this.state = State.COMMENT_START; 2071 } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) { 2072 // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here. 2073 this.currentLocation = this.getCurrentLocation($$.DOCTYPE.length + 1); 2074 this.state = State.DOCTYPE; 2075 } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) { 2076 if (this.inForeignNode) { 2077 this.state = State.CDATA_SECTION; 2078 } else { 2079 this._err(ERR.cdataInHtmlContent); 2080 this._createCommentToken($$.CDATA_START.length + 1); 2081 (this.currentToken as CommentToken).data = '[CDATA['; 2082 this.state = State.BOGUS_COMMENT; 2083 } 2084 } 2085 2086 //NOTE: Sequence lookups can be abrupted by hibernation. In that case, lookup 2087 //results are no longer valid and we will need to start over. 2088 else if (!this._ensureHibernation()) { 2089 this._err(ERR.incorrectlyOpenedComment); 2090 this._createCommentToken(2); 2091 this.state = State.BOGUS_COMMENT; 2092 this._stateBogusComment(cp); 2093 } 2094 } 2095 2096 // Comment start state 2097 //------------------------------------------------------------------ 2098 private _stateCommentStart(cp: number): void { 2099 switch (cp) { 2100 case $.HYPHEN_MINUS: { 2101 this.state = State.COMMENT_START_DASH; 2102 break; 2103 } 2104 case $.GREATER_THAN_SIGN: { 2105 this._err(ERR.abruptClosingOfEmptyComment); 2106 this.state = State.DATA; 2107 const token = this.currentToken as CommentToken; 2108 this.emitCurrentComment(token); 2109 break; 2110 } 2111 default: { 2112 this.state = State.COMMENT; 2113 this._stateComment(cp); 2114 } 2115 } 2116 } 2117 2118 // Comment start dash state 2119 //------------------------------------------------------------------ 2120 private _stateCommentStartDash(cp: number): void { 2121 const token = this.currentToken as CommentToken; 2122 switch (cp) { 2123 case $.HYPHEN_MINUS: { 2124 this.state = State.COMMENT_END; 2125 break; 2126 } 2127 case $.GREATER_THAN_SIGN: { 2128 this._err(ERR.abruptClosingOfEmptyComment); 2129 this.state = State.DATA; 2130 this.emitCurrentComment(token); 2131 break; 2132 } 2133 case $.EOF: { 2134 this._err(ERR.eofInComment); 2135 this.emitCurrentComment(token); 2136 this._emitEOFToken(); 2137 break; 2138 } 2139 default: { 2140 token.data += '-'; 2141 this.state = State.COMMENT; 2142 this._stateComment(cp); 2143 } 2144 } 2145 } 2146 2147 // Comment state 2148 //------------------------------------------------------------------ 2149 private _stateComment(cp: number): void { 2150 const token = this.currentToken as CommentToken; 2151 2152 switch (cp) { 2153 case $.HYPHEN_MINUS: { 2154 this.state = State.COMMENT_END_DASH; 2155 break; 2156 } 2157 case $.LESS_THAN_SIGN: { 2158 token.data += '<'; 2159 this.state = State.COMMENT_LESS_THAN_SIGN; 2160 break; 2161 } 2162 case $.NULL: { 2163 this._err(ERR.unexpectedNullCharacter); 2164 token.data += REPLACEMENT_CHARACTER; 2165 break; 2166 } 2167 case $.EOF: { 2168 this._err(ERR.eofInComment); 2169 this.emitCurrentComment(token); 2170 this._emitEOFToken(); 2171 break; 2172 } 2173 default: { 2174 token.data += String.fromCodePoint(cp); 2175 } 2176 } 2177 } 2178 2179 // Comment less-than sign state 2180 //------------------------------------------------------------------ 2181 private _stateCommentLessThanSign(cp: number): void { 2182 const token = this.currentToken as CommentToken; 2183 2184 switch (cp) { 2185 case $.EXCLAMATION_MARK: { 2186 token.data += '!'; 2187 this.state = State.COMMENT_LESS_THAN_SIGN_BANG; 2188 break; 2189 } 2190 case $.LESS_THAN_SIGN: { 2191 token.data += '<'; 2192 break; 2193 } 2194 default: { 2195 this.state = State.COMMENT; 2196 this._stateComment(cp); 2197 } 2198 } 2199 } 2200 2201 // Comment less-than sign bang state 2202 //------------------------------------------------------------------ 2203 private _stateCommentLessThanSignBang(cp: number): void { 2204 if (cp === $.HYPHEN_MINUS) { 2205 this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH; 2206 } else { 2207 this.state = State.COMMENT; 2208 this._stateComment(cp); 2209 } 2210 } 2211 2212 // Comment less-than sign bang dash state 2213 //------------------------------------------------------------------ 2214 private _stateCommentLessThanSignBangDash(cp: number): void { 2215 if (cp === $.HYPHEN_MINUS) { 2216 this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH; 2217 } else { 2218 this.state = State.COMMENT_END_DASH; 2219 this._stateCommentEndDash(cp); 2220 } 2221 } 2222 2223 // Comment less-than sign bang dash dash state 2224 //------------------------------------------------------------------ 2225 private _stateCommentLessThanSignBangDashDash(cp: number): void { 2226 if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) { 2227 this._err(ERR.nestedComment); 2228 } 2229 2230 this.state = State.COMMENT_END; 2231 this._stateCommentEnd(cp); 2232 } 2233 2234 // Comment end dash state 2235 //------------------------------------------------------------------ 2236 private _stateCommentEndDash(cp: number): void { 2237 const token = this.currentToken as CommentToken; 2238 switch (cp) { 2239 case $.HYPHEN_MINUS: { 2240 this.state = State.COMMENT_END; 2241 break; 2242 } 2243 case $.EOF: { 2244 this._err(ERR.eofInComment); 2245 this.emitCurrentComment(token); 2246 this._emitEOFToken(); 2247 break; 2248 } 2249 default: { 2250 token.data += '-'; 2251 this.state = State.COMMENT; 2252 this._stateComment(cp); 2253 } 2254 } 2255 } 2256 2257 // Comment end state 2258 //------------------------------------------------------------------ 2259 private _stateCommentEnd(cp: number): void { 2260 const token = this.currentToken as CommentToken; 2261 2262 switch (cp) { 2263 case $.GREATER_THAN_SIGN: { 2264 this.state = State.DATA; 2265 this.emitCurrentComment(token); 2266 break; 2267 } 2268 case $.EXCLAMATION_MARK: { 2269 this.state = State.COMMENT_END_BANG; 2270 break; 2271 } 2272 case $.HYPHEN_MINUS: { 2273 token.data += '-'; 2274 break; 2275 } 2276 case $.EOF: { 2277 this._err(ERR.eofInComment); 2278 this.emitCurrentComment(token); 2279 this._emitEOFToken(); 2280 break; 2281 } 2282 default: { 2283 token.data += '--'; 2284 this.state = State.COMMENT; 2285 this._stateComment(cp); 2286 } 2287 } 2288 } 2289 2290 // Comment end bang state 2291 //------------------------------------------------------------------ 2292 private _stateCommentEndBang(cp: number): void { 2293 const token = this.currentToken as CommentToken; 2294 2295 switch (cp) { 2296 case $.HYPHEN_MINUS: { 2297 token.data += '--!'; 2298 this.state = State.COMMENT_END_DASH; 2299 break; 2300 } 2301 case $.GREATER_THAN_SIGN: { 2302 this._err(ERR.incorrectlyClosedComment); 2303 this.state = State.DATA; 2304 this.emitCurrentComment(token); 2305 break; 2306 } 2307 case $.EOF: { 2308 this._err(ERR.eofInComment); 2309 this.emitCurrentComment(token); 2310 this._emitEOFToken(); 2311 break; 2312 } 2313 default: { 2314 token.data += '--!'; 2315 this.state = State.COMMENT; 2316 this._stateComment(cp); 2317 } 2318 } 2319 } 2320 2321 // DOCTYPE state 2322 //------------------------------------------------------------------ 2323 private _stateDoctype(cp: number): void { 2324 switch (cp) { 2325 case $.SPACE: 2326 case $.LINE_FEED: 2327 case $.TABULATION: 2328 case $.FORM_FEED: { 2329 this.state = State.BEFORE_DOCTYPE_NAME; 2330 break; 2331 } 2332 case $.GREATER_THAN_SIGN: { 2333 this.state = State.BEFORE_DOCTYPE_NAME; 2334 this._stateBeforeDoctypeName(cp); 2335 break; 2336 } 2337 case $.EOF: { 2338 this._err(ERR.eofInDoctype); 2339 this._createDoctypeToken(null); 2340 const token = this.currentToken as DoctypeToken; 2341 token.forceQuirks = true; 2342 this.emitCurrentDoctype(token); 2343 this._emitEOFToken(); 2344 break; 2345 } 2346 default: { 2347 this._err(ERR.missingWhitespaceBeforeDoctypeName); 2348 this.state = State.BEFORE_DOCTYPE_NAME; 2349 this._stateBeforeDoctypeName(cp); 2350 } 2351 } 2352 } 2353 2354 // Before DOCTYPE name state 2355 //------------------------------------------------------------------ 2356 private _stateBeforeDoctypeName(cp: number): void { 2357 if (isAsciiUpper(cp)) { 2358 this._createDoctypeToken(String.fromCharCode(toAsciiLower(cp))); 2359 this.state = State.DOCTYPE_NAME; 2360 } else 2361 switch (cp) { 2362 case $.SPACE: 2363 case $.LINE_FEED: 2364 case $.TABULATION: 2365 case $.FORM_FEED: { 2366 // Ignore whitespace 2367 break; 2368 } 2369 case $.NULL: { 2370 this._err(ERR.unexpectedNullCharacter); 2371 this._createDoctypeToken(REPLACEMENT_CHARACTER); 2372 this.state = State.DOCTYPE_NAME; 2373 break; 2374 } 2375 case $.GREATER_THAN_SIGN: { 2376 this._err(ERR.missingDoctypeName); 2377 this._createDoctypeToken(null); 2378 const token = this.currentToken as DoctypeToken; 2379 token.forceQuirks = true; 2380 this.emitCurrentDoctype(token); 2381 this.state = State.DATA; 2382 break; 2383 } 2384 case $.EOF: { 2385 this._err(ERR.eofInDoctype); 2386 this._createDoctypeToken(null); 2387 const token = this.currentToken as DoctypeToken; 2388 token.forceQuirks = true; 2389 this.emitCurrentDoctype(token); 2390 this._emitEOFToken(); 2391 break; 2392 } 2393 default: { 2394 this._createDoctypeToken(String.fromCodePoint(cp)); 2395 this.state = State.DOCTYPE_NAME; 2396 } 2397 } 2398 } 2399 2400 // DOCTYPE name state 2401 //------------------------------------------------------------------ 2402 private _stateDoctypeName(cp: number): void { 2403 const token = this.currentToken as DoctypeToken; 2404 2405 switch (cp) { 2406 case $.SPACE: 2407 case $.LINE_FEED: 2408 case $.TABULATION: 2409 case $.FORM_FEED: { 2410 this.state = State.AFTER_DOCTYPE_NAME; 2411 break; 2412 } 2413 case $.GREATER_THAN_SIGN: { 2414 this.state = State.DATA; 2415 this.emitCurrentDoctype(token); 2416 break; 2417 } 2418 case $.NULL: { 2419 this._err(ERR.unexpectedNullCharacter); 2420 token.name += REPLACEMENT_CHARACTER; 2421 break; 2422 } 2423 case $.EOF: { 2424 this._err(ERR.eofInDoctype); 2425 token.forceQuirks = true; 2426 this.emitCurrentDoctype(token); 2427 this._emitEOFToken(); 2428 break; 2429 } 2430 default: { 2431 token.name += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp); 2432 } 2433 } 2434 } 2435 2436 // After DOCTYPE name state 2437 //------------------------------------------------------------------ 2438 private _stateAfterDoctypeName(cp: number): void { 2439 const token = this.currentToken as DoctypeToken; 2440 2441 switch (cp) { 2442 case $.SPACE: 2443 case $.LINE_FEED: 2444 case $.TABULATION: 2445 case $.FORM_FEED: { 2446 // Ignore whitespace 2447 break; 2448 } 2449 case $.GREATER_THAN_SIGN: { 2450 this.state = State.DATA; 2451 this.emitCurrentDoctype(token); 2452 break; 2453 } 2454 case $.EOF: { 2455 this._err(ERR.eofInDoctype); 2456 token.forceQuirks = true; 2457 this.emitCurrentDoctype(token); 2458 this._emitEOFToken(); 2459 break; 2460 } 2461 default: { 2462 if (this._consumeSequenceIfMatch($$.PUBLIC, false)) { 2463 this.state = State.AFTER_DOCTYPE_PUBLIC_KEYWORD; 2464 } else if (this._consumeSequenceIfMatch($$.SYSTEM, false)) { 2465 this.state = State.AFTER_DOCTYPE_SYSTEM_KEYWORD; 2466 } 2467 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup 2468 //results are no longer valid and we will need to start over. 2469 else if (!this._ensureHibernation()) { 2470 this._err(ERR.invalidCharacterSequenceAfterDoctypeName); 2471 token.forceQuirks = true; 2472 this.state = State.BOGUS_DOCTYPE; 2473 this._stateBogusDoctype(cp); 2474 } 2475 } 2476 } 2477 } 2478 2479 // After DOCTYPE public keyword state 2480 //------------------------------------------------------------------ 2481 private _stateAfterDoctypePublicKeyword(cp: number): void { 2482 const token = this.currentToken as DoctypeToken; 2483 2484 switch (cp) { 2485 case $.SPACE: 2486 case $.LINE_FEED: 2487 case $.TABULATION: 2488 case $.FORM_FEED: { 2489 this.state = State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; 2490 break; 2491 } 2492 case $.QUOTATION_MARK: { 2493 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 2494 token.publicId = ''; 2495 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; 2496 break; 2497 } 2498 case $.APOSTROPHE: { 2499 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 2500 token.publicId = ''; 2501 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; 2502 break; 2503 } 2504 case $.GREATER_THAN_SIGN: { 2505 this._err(ERR.missingDoctypePublicIdentifier); 2506 token.forceQuirks = true; 2507 this.state = State.DATA; 2508 this.emitCurrentDoctype(token); 2509 break; 2510 } 2511 case $.EOF: { 2512 this._err(ERR.eofInDoctype); 2513 token.forceQuirks = true; 2514 this.emitCurrentDoctype(token); 2515 this._emitEOFToken(); 2516 break; 2517 } 2518 default: { 2519 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 2520 token.forceQuirks = true; 2521 this.state = State.BOGUS_DOCTYPE; 2522 this._stateBogusDoctype(cp); 2523 } 2524 } 2525 } 2526 2527 // Before DOCTYPE public identifier state 2528 //------------------------------------------------------------------ 2529 private _stateBeforeDoctypePublicIdentifier(cp: number): void { 2530 const token = this.currentToken as DoctypeToken; 2531 2532 switch (cp) { 2533 case $.SPACE: 2534 case $.LINE_FEED: 2535 case $.TABULATION: 2536 case $.FORM_FEED: { 2537 // Ignore whitespace 2538 break; 2539 } 2540 case $.QUOTATION_MARK: { 2541 token.publicId = ''; 2542 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; 2543 break; 2544 } 2545 case $.APOSTROPHE: { 2546 token.publicId = ''; 2547 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; 2548 break; 2549 } 2550 case $.GREATER_THAN_SIGN: { 2551 this._err(ERR.missingDoctypePublicIdentifier); 2552 token.forceQuirks = true; 2553 this.state = State.DATA; 2554 this.emitCurrentDoctype(token); 2555 break; 2556 } 2557 case $.EOF: { 2558 this._err(ERR.eofInDoctype); 2559 token.forceQuirks = true; 2560 this.emitCurrentDoctype(token); 2561 this._emitEOFToken(); 2562 break; 2563 } 2564 default: { 2565 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 2566 token.forceQuirks = true; 2567 this.state = State.BOGUS_DOCTYPE; 2568 this._stateBogusDoctype(cp); 2569 } 2570 } 2571 } 2572 2573 // DOCTYPE public identifier (double-quoted) state 2574 //------------------------------------------------------------------ 2575 private _stateDoctypePublicIdentifierDoubleQuoted(cp: number): void { 2576 const token = this.currentToken as DoctypeToken; 2577 2578 switch (cp) { 2579 case $.QUOTATION_MARK: { 2580 this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; 2581 break; 2582 } 2583 case $.NULL: { 2584 this._err(ERR.unexpectedNullCharacter); 2585 token.publicId += REPLACEMENT_CHARACTER; 2586 break; 2587 } 2588 case $.GREATER_THAN_SIGN: { 2589 this._err(ERR.abruptDoctypePublicIdentifier); 2590 token.forceQuirks = true; 2591 this.emitCurrentDoctype(token); 2592 this.state = State.DATA; 2593 break; 2594 } 2595 case $.EOF: { 2596 this._err(ERR.eofInDoctype); 2597 token.forceQuirks = true; 2598 this.emitCurrentDoctype(token); 2599 this._emitEOFToken(); 2600 break; 2601 } 2602 default: { 2603 token.publicId += String.fromCodePoint(cp); 2604 } 2605 } 2606 } 2607 2608 // DOCTYPE public identifier (single-quoted) state 2609 //------------------------------------------------------------------ 2610 private _stateDoctypePublicIdentifierSingleQuoted(cp: number): void { 2611 const token = this.currentToken as DoctypeToken; 2612 2613 switch (cp) { 2614 case $.APOSTROPHE: { 2615 this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; 2616 break; 2617 } 2618 case $.NULL: { 2619 this._err(ERR.unexpectedNullCharacter); 2620 token.publicId += REPLACEMENT_CHARACTER; 2621 break; 2622 } 2623 case $.GREATER_THAN_SIGN: { 2624 this._err(ERR.abruptDoctypePublicIdentifier); 2625 token.forceQuirks = true; 2626 this.emitCurrentDoctype(token); 2627 this.state = State.DATA; 2628 break; 2629 } 2630 case $.EOF: { 2631 this._err(ERR.eofInDoctype); 2632 token.forceQuirks = true; 2633 this.emitCurrentDoctype(token); 2634 this._emitEOFToken(); 2635 break; 2636 } 2637 default: { 2638 token.publicId += String.fromCodePoint(cp); 2639 } 2640 } 2641 } 2642 2643 // After DOCTYPE public identifier state 2644 //------------------------------------------------------------------ 2645 private _stateAfterDoctypePublicIdentifier(cp: number): void { 2646 const token = this.currentToken as DoctypeToken; 2647 2648 switch (cp) { 2649 case $.SPACE: 2650 case $.LINE_FEED: 2651 case $.TABULATION: 2652 case $.FORM_FEED: { 2653 this.state = State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; 2654 break; 2655 } 2656 case $.GREATER_THAN_SIGN: { 2657 this.state = State.DATA; 2658 this.emitCurrentDoctype(token); 2659 break; 2660 } 2661 case $.QUOTATION_MARK: { 2662 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); 2663 token.systemId = ''; 2664 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; 2665 break; 2666 } 2667 case $.APOSTROPHE: { 2668 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); 2669 token.systemId = ''; 2670 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; 2671 break; 2672 } 2673 case $.EOF: { 2674 this._err(ERR.eofInDoctype); 2675 token.forceQuirks = true; 2676 this.emitCurrentDoctype(token); 2677 this._emitEOFToken(); 2678 break; 2679 } 2680 default: { 2681 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 2682 token.forceQuirks = true; 2683 this.state = State.BOGUS_DOCTYPE; 2684 this._stateBogusDoctype(cp); 2685 } 2686 } 2687 } 2688 2689 // Between DOCTYPE public and system identifiers state 2690 //------------------------------------------------------------------ 2691 private _stateBetweenDoctypePublicAndSystemIdentifiers(cp: number): void { 2692 const token = this.currentToken as DoctypeToken; 2693 2694 switch (cp) { 2695 case $.SPACE: 2696 case $.LINE_FEED: 2697 case $.TABULATION: 2698 case $.FORM_FEED: { 2699 // Ignore whitespace 2700 break; 2701 } 2702 case $.GREATER_THAN_SIGN: { 2703 this.emitCurrentDoctype(token); 2704 this.state = State.DATA; 2705 break; 2706 } 2707 case $.QUOTATION_MARK: { 2708 token.systemId = ''; 2709 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; 2710 break; 2711 } 2712 case $.APOSTROPHE: { 2713 token.systemId = ''; 2714 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; 2715 break; 2716 } 2717 case $.EOF: { 2718 this._err(ERR.eofInDoctype); 2719 token.forceQuirks = true; 2720 this.emitCurrentDoctype(token); 2721 this._emitEOFToken(); 2722 break; 2723 } 2724 default: { 2725 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 2726 token.forceQuirks = true; 2727 this.state = State.BOGUS_DOCTYPE; 2728 this._stateBogusDoctype(cp); 2729 } 2730 } 2731 } 2732 2733 // After DOCTYPE system keyword state 2734 //------------------------------------------------------------------ 2735 private _stateAfterDoctypeSystemKeyword(cp: number): void { 2736 const token = this.currentToken as DoctypeToken; 2737 2738 switch (cp) { 2739 case $.SPACE: 2740 case $.LINE_FEED: 2741 case $.TABULATION: 2742 case $.FORM_FEED: { 2743 this.state = State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; 2744 break; 2745 } 2746 case $.QUOTATION_MARK: { 2747 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); 2748 token.systemId = ''; 2749 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; 2750 break; 2751 } 2752 case $.APOSTROPHE: { 2753 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); 2754 token.systemId = ''; 2755 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; 2756 break; 2757 } 2758 case $.GREATER_THAN_SIGN: { 2759 this._err(ERR.missingDoctypeSystemIdentifier); 2760 token.forceQuirks = true; 2761 this.state = State.DATA; 2762 this.emitCurrentDoctype(token); 2763 break; 2764 } 2765 case $.EOF: { 2766 this._err(ERR.eofInDoctype); 2767 token.forceQuirks = true; 2768 this.emitCurrentDoctype(token); 2769 this._emitEOFToken(); 2770 break; 2771 } 2772 default: { 2773 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 2774 token.forceQuirks = true; 2775 this.state = State.BOGUS_DOCTYPE; 2776 this._stateBogusDoctype(cp); 2777 } 2778 } 2779 } 2780 2781 // Before DOCTYPE system identifier state 2782 //------------------------------------------------------------------ 2783 private _stateBeforeDoctypeSystemIdentifier(cp: number): void { 2784 const token = this.currentToken as DoctypeToken; 2785 2786 switch (cp) { 2787 case $.SPACE: 2788 case $.LINE_FEED: 2789 case $.TABULATION: 2790 case $.FORM_FEED: { 2791 // Ignore whitespace 2792 break; 2793 } 2794 case $.QUOTATION_MARK: { 2795 token.systemId = ''; 2796 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; 2797 break; 2798 } 2799 case $.APOSTROPHE: { 2800 token.systemId = ''; 2801 this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; 2802 break; 2803 } 2804 case $.GREATER_THAN_SIGN: { 2805 this._err(ERR.missingDoctypeSystemIdentifier); 2806 token.forceQuirks = true; 2807 this.state = State.DATA; 2808 this.emitCurrentDoctype(token); 2809 break; 2810 } 2811 case $.EOF: { 2812 this._err(ERR.eofInDoctype); 2813 token.forceQuirks = true; 2814 this.emitCurrentDoctype(token); 2815 this._emitEOFToken(); 2816 break; 2817 } 2818 default: { 2819 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); 2820 token.forceQuirks = true; 2821 this.state = State.BOGUS_DOCTYPE; 2822 this._stateBogusDoctype(cp); 2823 } 2824 } 2825 } 2826 2827 // DOCTYPE system identifier (double-quoted) state 2828 //------------------------------------------------------------------ 2829 private _stateDoctypeSystemIdentifierDoubleQuoted(cp: number): void { 2830 const token = this.currentToken as DoctypeToken; 2831 2832 switch (cp) { 2833 case $.QUOTATION_MARK: { 2834 this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; 2835 break; 2836 } 2837 case $.NULL: { 2838 this._err(ERR.unexpectedNullCharacter); 2839 token.systemId += REPLACEMENT_CHARACTER; 2840 break; 2841 } 2842 case $.GREATER_THAN_SIGN: { 2843 this._err(ERR.abruptDoctypeSystemIdentifier); 2844 token.forceQuirks = true; 2845 this.emitCurrentDoctype(token); 2846 this.state = State.DATA; 2847 break; 2848 } 2849 case $.EOF: { 2850 this._err(ERR.eofInDoctype); 2851 token.forceQuirks = true; 2852 this.emitCurrentDoctype(token); 2853 this._emitEOFToken(); 2854 break; 2855 } 2856 default: { 2857 token.systemId += String.fromCodePoint(cp); 2858 } 2859 } 2860 } 2861 2862 // DOCTYPE system identifier (single-quoted) state 2863 //------------------------------------------------------------------ 2864 private _stateDoctypeSystemIdentifierSingleQuoted(cp: number): void { 2865 const token = this.currentToken as DoctypeToken; 2866 2867 switch (cp) { 2868 case $.APOSTROPHE: { 2869 this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; 2870 break; 2871 } 2872 case $.NULL: { 2873 this._err(ERR.unexpectedNullCharacter); 2874 token.systemId += REPLACEMENT_CHARACTER; 2875 break; 2876 } 2877 case $.GREATER_THAN_SIGN: { 2878 this._err(ERR.abruptDoctypeSystemIdentifier); 2879 token.forceQuirks = true; 2880 this.emitCurrentDoctype(token); 2881 this.state = State.DATA; 2882 break; 2883 } 2884 case $.EOF: { 2885 this._err(ERR.eofInDoctype); 2886 token.forceQuirks = true; 2887 this.emitCurrentDoctype(token); 2888 this._emitEOFToken(); 2889 break; 2890 } 2891 default: { 2892 token.systemId += String.fromCodePoint(cp); 2893 } 2894 } 2895 } 2896 2897 // After DOCTYPE system identifier state 2898 //------------------------------------------------------------------ 2899 private _stateAfterDoctypeSystemIdentifier(cp: number): void { 2900 const token = this.currentToken as DoctypeToken; 2901 2902 switch (cp) { 2903 case $.SPACE: 2904 case $.LINE_FEED: 2905 case $.TABULATION: 2906 case $.FORM_FEED: { 2907 // Ignore whitespace 2908 break; 2909 } 2910 case $.GREATER_THAN_SIGN: { 2911 this.emitCurrentDoctype(token); 2912 this.state = State.DATA; 2913 break; 2914 } 2915 case $.EOF: { 2916 this._err(ERR.eofInDoctype); 2917 token.forceQuirks = true; 2918 this.emitCurrentDoctype(token); 2919 this._emitEOFToken(); 2920 break; 2921 } 2922 default: { 2923 this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier); 2924 this.state = State.BOGUS_DOCTYPE; 2925 this._stateBogusDoctype(cp); 2926 } 2927 } 2928 } 2929 2930 // Bogus DOCTYPE state 2931 //------------------------------------------------------------------ 2932 private _stateBogusDoctype(cp: number): void { 2933 const token = this.currentToken as DoctypeToken; 2934 2935 switch (cp) { 2936 case $.GREATER_THAN_SIGN: { 2937 this.emitCurrentDoctype(token); 2938 this.state = State.DATA; 2939 break; 2940 } 2941 case $.NULL: { 2942 this._err(ERR.unexpectedNullCharacter); 2943 break; 2944 } 2945 case $.EOF: { 2946 this.emitCurrentDoctype(token); 2947 this._emitEOFToken(); 2948 break; 2949 } 2950 default: 2951 // Do nothing 2952 } 2953 } 2954 2955 // CDATA section state 2956 //------------------------------------------------------------------ 2957 private _stateCdataSection(cp: number): void { 2958 switch (cp) { 2959 case $.RIGHT_SQUARE_BRACKET: { 2960 this.state = State.CDATA_SECTION_BRACKET; 2961 break; 2962 } 2963 case $.EOF: { 2964 this._err(ERR.eofInCdata); 2965 this._emitEOFToken(); 2966 break; 2967 } 2968 default: { 2969 this._emitCodePoint(cp); 2970 } 2971 } 2972 } 2973 2974 // CDATA section bracket state 2975 //------------------------------------------------------------------ 2976 private _stateCdataSectionBracket(cp: number): void { 2977 if (cp === $.RIGHT_SQUARE_BRACKET) { 2978 this.state = State.CDATA_SECTION_END; 2979 } else { 2980 this._emitChars(']'); 2981 this.state = State.CDATA_SECTION; 2982 this._stateCdataSection(cp); 2983 } 2984 } 2985 2986 // CDATA section end state 2987 //------------------------------------------------------------------ 2988 private _stateCdataSectionEnd(cp: number): void { 2989 switch (cp) { 2990 case $.GREATER_THAN_SIGN: { 2991 this.state = State.DATA; 2992 break; 2993 } 2994 case $.RIGHT_SQUARE_BRACKET: { 2995 this._emitChars(']'); 2996 break; 2997 } 2998 default: { 2999 this._emitChars(']]'); 3000 this.state = State.CDATA_SECTION; 3001 this._stateCdataSection(cp); 3002 } 3003 } 3004 } 3005 3006 // Character reference state 3007 //------------------------------------------------------------------ 3008 private _stateCharacterReference(cp: number): void { 3009 if (cp === $.NUMBER_SIGN) { 3010 this.state = State.NUMERIC_CHARACTER_REFERENCE; 3011 } else if (isAsciiAlphaNumeric(cp)) { 3012 this.state = State.NAMED_CHARACTER_REFERENCE; 3013 this._stateNamedCharacterReference(cp); 3014 } else { 3015 this._flushCodePointConsumedAsCharacterReference($.AMPERSAND); 3016 this._reconsumeInState(this.returnState, cp); 3017 } 3018 } 3019 3020 // Named character reference state 3021 //------------------------------------------------------------------ 3022 private _stateNamedCharacterReference(cp: number): void { 3023 const matchResult = this._matchNamedCharacterReference(cp); 3024 3025 //NOTE: Matching can be abrupted by hibernation. In that case, match 3026 //results are no longer valid and we will need to start over. 3027 if (this._ensureHibernation()) { 3028 // Stay in the state, try again. 3029 } else if (matchResult) { 3030 for (let i = 0; i < matchResult.length; i++) { 3031 this._flushCodePointConsumedAsCharacterReference(matchResult[i]); 3032 } 3033 this.state = this.returnState; 3034 } else { 3035 this._flushCodePointConsumedAsCharacterReference($.AMPERSAND); 3036 this.state = State.AMBIGUOUS_AMPERSAND; 3037 } 3038 } 3039 3040 // Ambiguos ampersand state 3041 //------------------------------------------------------------------ 3042 private _stateAmbiguousAmpersand(cp: number): void { 3043 if (isAsciiAlphaNumeric(cp)) { 3044 this._flushCodePointConsumedAsCharacterReference(cp); 3045 } else { 3046 if (cp === $.SEMICOLON) { 3047 this._err(ERR.unknownNamedCharacterReference); 3048 } 3049 3050 this._reconsumeInState(this.returnState, cp); 3051 } 3052 } 3053 3054 // Numeric character reference state 3055 //------------------------------------------------------------------ 3056 private _stateNumericCharacterReference(cp: number): void { 3057 this.charRefCode = 0; 3058 3059 if (cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X) { 3060 this.state = State.HEXADEMICAL_CHARACTER_REFERENCE_START; 3061 } 3062 // Inlined decimal character reference start state 3063 else if (isAsciiDigit(cp)) { 3064 this.state = State.DECIMAL_CHARACTER_REFERENCE; 3065 this._stateDecimalCharacterReference(cp); 3066 } else { 3067 this._err(ERR.absenceOfDigitsInNumericCharacterReference); 3068 this._flushCodePointConsumedAsCharacterReference($.AMPERSAND); 3069 this._flushCodePointConsumedAsCharacterReference($.NUMBER_SIGN); 3070 this._reconsumeInState(this.returnState, cp); 3071 } 3072 } 3073 3074 // Hexademical character reference start state 3075 //------------------------------------------------------------------ 3076 private _stateHexademicalCharacterReferenceStart(cp: number): void { 3077 if (isAsciiHexDigit(cp)) { 3078 this.state = State.HEXADEMICAL_CHARACTER_REFERENCE; 3079 this._stateHexademicalCharacterReference(cp); 3080 } else { 3081 this._err(ERR.absenceOfDigitsInNumericCharacterReference); 3082 this._flushCodePointConsumedAsCharacterReference($.AMPERSAND); 3083 this._flushCodePointConsumedAsCharacterReference($.NUMBER_SIGN); 3084 this._unconsume(2); 3085 this.state = this.returnState; 3086 } 3087 } 3088 3089 // Hexademical character reference state 3090 //------------------------------------------------------------------ 3091 private _stateHexademicalCharacterReference(cp: number): void { 3092 if (isAsciiUpperHexDigit(cp)) { 3093 this.charRefCode = this.charRefCode * 16 + cp - 0x37; 3094 } else if (isAsciiLowerHexDigit(cp)) { 3095 this.charRefCode = this.charRefCode * 16 + cp - 0x57; 3096 } else if (isAsciiDigit(cp)) { 3097 this.charRefCode = this.charRefCode * 16 + cp - 0x30; 3098 } else if (cp === $.SEMICOLON) { 3099 this.state = State.NUMERIC_CHARACTER_REFERENCE_END; 3100 } else { 3101 this._err(ERR.missingSemicolonAfterCharacterReference); 3102 this.state = State.NUMERIC_CHARACTER_REFERENCE_END; 3103 this._stateNumericCharacterReferenceEnd(cp); 3104 } 3105 } 3106 3107 // Decimal character reference state 3108 //------------------------------------------------------------------ 3109 private _stateDecimalCharacterReference(cp: number): void { 3110 if (isAsciiDigit(cp)) { 3111 this.charRefCode = this.charRefCode * 10 + cp - 0x30; 3112 } else if (cp === $.SEMICOLON) { 3113 this.state = State.NUMERIC_CHARACTER_REFERENCE_END; 3114 } else { 3115 this._err(ERR.missingSemicolonAfterCharacterReference); 3116 this.state = State.NUMERIC_CHARACTER_REFERENCE_END; 3117 this._stateNumericCharacterReferenceEnd(cp); 3118 } 3119 } 3120 3121 // Numeric character reference end state 3122 //------------------------------------------------------------------ 3123 private _stateNumericCharacterReferenceEnd(cp: number): void { 3124 if (this.charRefCode === $.NULL) { 3125 this._err(ERR.nullCharacterReference); 3126 this.charRefCode = $.REPLACEMENT_CHARACTER; 3127 } else if (this.charRefCode > 0x10_ff_ff) { 3128 this._err(ERR.characterReferenceOutsideUnicodeRange); 3129 this.charRefCode = $.REPLACEMENT_CHARACTER; 3130 } else if (isSurrogate(this.charRefCode)) { 3131 this._err(ERR.surrogateCharacterReference); 3132 this.charRefCode = $.REPLACEMENT_CHARACTER; 3133 } else if (isUndefinedCodePoint(this.charRefCode)) { 3134 this._err(ERR.noncharacterCharacterReference); 3135 } else if (isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { 3136 this._err(ERR.controlCharacterReference); 3137 3138 const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS.get(this.charRefCode); 3139 3140 if (replacement !== undefined) { 3141 this.charRefCode = replacement; 3142 } 3143 } 3144 3145 this._flushCodePointConsumedAsCharacterReference(this.charRefCode); 3146 this._reconsumeInState(this.returnState, cp); 3147 } 3148} 3149 3150function checkselfClosingNode(parse: Tokenizer, token: TagToken) { 3151 const tagName: string = (token.tagName || "").toLowerCase(); 3152 const selfClosing: boolean = token.selfClosing; 3153 const flag: boolean = parse.validator.isSupportedSelfClosing(tagName); 3154 if (parse.nodeInfo.tn && tagName && !parse.nodeInfo.sc) { 3155 const loc: string = 3156 String(token.location?.startLine) + ',' + String(token.location?.startCol); 3157 if ( 3158 !flag || 3159 (loc !== parse.nodeInfo.pos && token.type === TokenType.START_TAG) 3160 ) { 3161 const posInfo: string = parse.nodeInfo.pos; 3162 const posArr: string[] = posInfo.split(','); 3163 parse.compileResult.log.push({ 3164 line: Number(posArr[0]) || 1, 3165 column: Number(posArr[1]) || 1, 3166 reason: 'ERROR: tag `' + parse.nodeInfo.tn + '` must be closed, please follow norm', 3167 }); 3168 parse.nodeInfo = { tn: '', sc: false, pos: '' }; 3169 } 3170 } 3171 if (tagName && flag) { 3172 if (token.type === TokenType.START_TAG && !selfClosing) { 3173 parse.nodeInfo.tn = tagName; 3174 parse.nodeInfo.sc = false; 3175 parse.nodeInfo.pos = 3176 String(token.location?.startLine) + ',' + String(token.location?.startCol); 3177 } 3178 if ( 3179 token.type === TokenType.END_TAG && tagName === parse.nodeInfo.tn 3180 ) { 3181 parse.nodeInfo.sc = true; 3182 } 3183 } 3184 if (!flag && selfClosing && token.type === TokenType.START_TAG) { 3185 parse.compileResult.log.push({ 3186 line: token.location?.startLine || 1, 3187 column: token.location?.startCol || 1, 3188 reason: "ERROR: tag `" + tagName + "` can not use selfClosing", 3189 }); 3190 } 3191}