import { Preprocessor } from './preprocessor.js';
import {
    CODE_POINTS as $,
    SEQUENCES as $$,
    REPLACEMENT_CHARACTER,
    isSurrogate,
    isUndefinedCodePoint,
    isControlCodePoint,
} from '../common/unicode.js';
import {
    TokenType,
    getTokenAttr,
    type Token,
    type CharacterToken,
    type DoctypeToken,
    type TagToken,
    type EOFToken,
    type CommentToken,
    type Attribute,
    type Location,
} from '../common/token.js';
import { htmlDecodeTree, BinTrieFlags, determineBranch } from 'entities/lib/decode.js';
import { ERR, type ParserErrorHandler } from '../common/error-codes.js';
import { TAG_ID, getTagID } from '../common/html.js';

/**
 * Replacement table for numeric character references that land in the C1 control range.
 * Keys are the referenced code points (0x80-0x9F, with gaps); values are the code points
 * that are substituted for them.
 */
const C1_CONTROLS_REFERENCE_REPLACEMENTS = new Map<number, number>([
    [0x80, 0x20_ac],
    [0x82, 0x20_1a],
    [0x83, 0x01_92],
    [0x84, 0x20_1e],
    [0x85, 0x20_26],
    [0x86, 0x20_20],
    [0x87, 0x20_21],
    [0x88, 0x02_c6],
    [0x89, 0x20_30],
    [0x8a, 0x01_60],
    [0x8b, 0x20_39],
    [0x8c, 0x01_52],
    [0x8e, 0x01_7d],
    [0x91, 0x20_18],
    [0x92, 0x20_19],
    [0x93, 0x20_1c],
    [0x94, 0x20_1d],
    [0x95, 0x20_22],
    [0x96, 0x20_13],
    [0x97, 0x20_14],
    [0x98, 0x02_dc],
    [0x99, 0x21_22],
    [0x9a, 0x01_61],
    [0x9b, 0x20_3a],
    [0x9c, 0x01_53],
    [0x9e, 0x01_7e],
    [0x9f, 0x01_78],
]);

/**
 * Tokenizer states. Members are implicitly numbered from 0 in declaration order and
 * `TokenizerMode` re-exports some of those numbers, so the order must not change.
 */
const enum State {
    // Text content states
    DATA,
    RCDATA,
    RAWTEXT,
    SCRIPT_DATA,
    PLAINTEXT,

    // Tag states
    TAG_OPEN,
    END_TAG_OPEN,
    TAG_NAME,

    // RCDATA / RAWTEXT end-tag detection
    RCDATA_LESS_THAN_SIGN,
    RCDATA_END_TAG_OPEN,
    RCDATA_END_TAG_NAME,
    RAWTEXT_LESS_THAN_SIGN,
    RAWTEXT_END_TAG_OPEN,
    RAWTEXT_END_TAG_NAME,

    // Script data states
    SCRIPT_DATA_LESS_THAN_SIGN,
    SCRIPT_DATA_END_TAG_OPEN,
    SCRIPT_DATA_END_TAG_NAME,
    SCRIPT_DATA_ESCAPE_START,
    SCRIPT_DATA_ESCAPE_START_DASH,
    SCRIPT_DATA_ESCAPED,
    SCRIPT_DATA_ESCAPED_DASH,
    SCRIPT_DATA_ESCAPED_DASH_DASH,
    SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
    SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
    SCRIPT_DATA_ESCAPED_END_TAG_NAME,
    SCRIPT_DATA_DOUBLE_ESCAPE_START,
    SCRIPT_DATA_DOUBLE_ESCAPED,
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
    SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
    SCRIPT_DATA_DOUBLE_ESCAPE_END,

    // Attribute states
    BEFORE_ATTRIBUTE_NAME,
    ATTRIBUTE_NAME,
    AFTER_ATTRIBUTE_NAME,
    BEFORE_ATTRIBUTE_VALUE,
    ATTRIBUTE_VALUE_DOUBLE_QUOTED,
    ATTRIBUTE_VALUE_SINGLE_QUOTED,
    ATTRIBUTE_VALUE_UNQUOTED,
    AFTER_ATTRIBUTE_VALUE_QUOTED,
    SELF_CLOSING_START_TAG,

    // Comment states
    BOGUS_COMMENT,
    MARKUP_DECLARATION_OPEN,
    COMMENT_START,
    COMMENT_START_DASH,
    COMMENT,
    COMMENT_LESS_THAN_SIGN,
    COMMENT_LESS_THAN_SIGN_BANG,
    COMMENT_LESS_THAN_SIGN_BANG_DASH,
    COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
    COMMENT_END_DASH,
    COMMENT_END,
    COMMENT_END_BANG,

    // DOCTYPE states
    DOCTYPE,
    BEFORE_DOCTYPE_NAME,
    DOCTYPE_NAME,
    AFTER_DOCTYPE_NAME,
    AFTER_DOCTYPE_PUBLIC_KEYWORD,
    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
    DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED,
    AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
    AFTER_DOCTYPE_SYSTEM_KEYWORD,
    BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED,
    AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
    BOGUS_DOCTYPE,

    // CDATA states
    CDATA_SECTION,
    CDATA_SECTION_BRACKET,
    CDATA_SECTION_END,

    // Character reference states
    CHARACTER_REFERENCE,
    NAMED_CHARACTER_REFERENCE,
    AMBIGUOUS_AMPERSAND,
    NUMERIC_CHARACTER_REFERENCE,
    HEXADEMICAL_CHARACTER_REFERENCE_START,
    DECIMAL_CHARACTER_REFERENCE_START,
    HEXADEMICAL_CHARACTER_REFERENCE,
    DECIMAL_CHARACTER_REFERENCE,
    NUMERIC_CHARACTER_REFERENCE_END,
}

/** Initial tokenizer states for the different modes; each value is the matching `State` member. */
export const TokenizerMode = {
    DATA: State.DATA,
    RCDATA: State.RCDATA,
    RAWTEXT: State.RAWTEXT,
    SCRIPT_DATA: State.SCRIPT_DATA,
    PLAINTEXT: State.PLAINTEXT,
    CDATA_SECTION: State.CDATA_SECTION,
} as const;

//Utils

//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
//these functions if they are situated in another module due to context switch.
//Always perform an inlining check before modifying these functions ('node --trace-inlining').

/** True for the code points of '0'..'9'. */
function isAsciiDigit(cp: number): boolean {
    return $.DIGIT_0 <= cp && cp <= $.DIGIT_9;
}

/** True for the code points of 'A'..'Z'. */
function isAsciiUpper(cp: number): boolean {
    return $.LATIN_CAPITAL_A <= cp && cp <= $.LATIN_CAPITAL_Z;
}

/** True for the code points of 'a'..'z'. */
function isAsciiLower(cp: number): boolean {
    return $.LATIN_SMALL_A <= cp && cp <= $.LATIN_SMALL_Z;
}

/** True for an ASCII letter of either case. */
function isAsciiLetter(cp: number): boolean {
    return isAsciiLower(cp) || isAsciiUpper(cp);
}

/** True for an ASCII letter or digit. */
function isAsciiAlphaNumeric(cp: number): boolean {
    return isAsciiLetter(cp) || isAsciiDigit(cp);
}

/** True for the code points of 'A'..'F'. */
function isAsciiUpperHexDigit(cp: number): boolean {
    return $.LATIN_CAPITAL_A <= cp && cp <= $.LATIN_CAPITAL_F;
}

/** True for the code points of 'a'..'f'. */
function isAsciiLowerHexDigit(cp: number): boolean {
    return $.LATIN_SMALL_A <= cp && cp <= $.LATIN_SMALL_F;
}

/** True for any hexadecimal digit: 0-9, A-F or a-f. */
function isAsciiHexDigit(cp: number): boolean {
    return isAsciiDigit(cp) || isAsciiUpperHexDigit(cp) || isAsciiLowerHexDigit(cp);
}

/**
 * Adds 0x20 to the code point. This only yields the lowercase letter when `cp` is an
 * ASCII uppercase letter; callers are expected to check `isAsciiUpper` first.
 */
function toAsciiLower(cp: number): number {
    return cp + 0x20;
}

/** True for space, line feed, tab and form feed. */
function isWhitespace(cp: number): boolean {
    switch (cp) {
        case $.SPACE:
        case $.LINE_FEED:
        case $.TABULATION:
        case $.FORM_FEED:
            return true;
        default:
            return false;
    }
}

/** True when the code point following a named reference is '=' or an ASCII alphanumeric. */
function isEntityInAttributeInvalidEnd(nextCp: number): boolean {
    return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp);
}

/** True for whitespace, '/' or '>'. */
function isScriptDataDoubleEscapeSequenceEnd(cp: number): boolean {
    return isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN;
}

/** Fallback validator used when the options do not supply one: it reports no tag as supported. */
const componentValidator = { isSupportedSelfClosing: () => false };

/** Answers whether a tag name is supported as a self-closing tag. */
interface Validator {
    isSupportedSelfClosing(tagName: string): boolean;
}

// NOTE(review): the semantics of `jsonTemplate` and `deps` are not visible in this part of
// the file; the field types are kept exactly as declared.
interface CompileResult {
    jsonTemplate: {};
    deps: [];
    log: {
        line: number;
        column: number;
        reason: string;
    }[];
}

// NOTE(review): field meanings (presumably tag name / self-closing flag / position) are not
// established by the code visible here - confirm against the code that fills them in.
interface NodeInfo {
    tn: string;
    sc: boolean;
    pos: string;
}

export interface TokenizerOptions {
    /** Replaces the default validator when provided. */
    componentValidator?: Validator;
    /** Replaces the tokenizer's default (empty) compile result when provided. */
    compileResult?: CompileResult;
    /** When falsy, tokens carry no location information (`getCurrentLocation` returns null). */
    sourceCodeLocationInfo?: boolean;
}

/** Callbacks through which the tokenizer delivers tokens and parse errors. */
export interface TokenHandler {
    onComment(token: CommentToken): void;
    onDoctype(token: DoctypeToken): void;
    onStartTag(token: TagToken): void;
    onEndTag(token: TagToken): void;
    onEof(token: EOFToken): void;
    onCharacter(token: CharacterToken): void;
    onNullCharacter(token: CharacterToken): void;
    onWhitespaceCharacter(token: CharacterToken): void;

    onParseError?: ParserErrorHandler | null;
}

//Tokenizer
export class Tokenizer {
    public preprocessor: Preprocessor;

    /** Set by `pause()` and cleared by `resume()`; the parsing loop stops while it is true. */
    private paused = false;
    /** Ensures that the parsing loop isn't run multiple times at once. */
    private inLoop = false;

    /**
     * Indicates that the current adjusted node exists, is not an element in the HTML namespace,
     * and that it is not an integration point for either MathML or HTML.
     *
     * @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction}
     */
    public inForeignNode = false;
    /** Name of the most recently emitted start tag. */
    public lastStartTagName = '';
    /** True while there is input to process; cleared on hibernation and after EOF is emitted. */
    public active = false;

    public nodeInfo: NodeInfo = { tn: '', sc: false, pos: '' };
    public validator: Validator = componentValidator;
    public compileResult: CompileResult = { jsonTemplate: {}, deps: [], log: [] };

    /** State the tokenizer is currently in. */
    public state = State.DATA;
    /** State to go back to once a character reference has been handled. */
    private returnState = State.DATA;

    private charRefCode = -1;

    /** Number of code points consumed since the parsing loop last reset the counter. */
    private consumedAfterSnapshot = -1;

    private currentLocation: Location | null;
    private currentCharacterToken: CharacterToken | null = null;
    private currentToken: Token | null = null;
    private currentAttr: Attribute = { name: '', value: '' };

    /**
     * @param options Tokenizer options; an optional validator and compile result override the defaults.
     * @param handler Receiver of emitted tokens and parse errors; also handed to the preprocessor.
     */
    constructor(private options: TokenizerOptions, private handler: TokenHandler) {
        this.preprocessor = new Preprocessor(handler);
        this.currentLocation = this.getCurrentLocation(-1);

        const { componentValidator: customValidator, compileResult } = options;

        if (customValidator) {
            this.validator = customValidator;
        }
        if (compileResult) {
            this.compileResult = compileResult;
        }
    }

    //Errors
    /** Reports a parse error through the handler's optional `onParseError` callback. */
    private _err(code: ERR): void {
        this.handler.onParseError?.(this.preprocessor.getError(code));
    }

    // NOTE: `offset` may never run across line boundaries.
/**
 * Builds a location record for a token that starts `offset` code points before the
 * preprocessor's current position.
 *
 * @param offset Distance back from the current position on the same line.
 * @returns `null` when the `sourceCodeLocationInfo` option is off; otherwise a location
 * whose end fields hold the placeholder -1 until they are filled in on emission.
 */
private getCurrentLocation(offset: number): Location | null {
    if (!this.options.sourceCodeLocationInfo) {
        return null;
    }

    return {
        startLine: this.preprocessor.line,
        startCol: this.preprocessor.col - offset,
        startOffset: this.preprocessor.offset - offset,
        endLine: -1,
        endCol: -1,
        endOffset: -1,
    };
}

/**
 * Feeds code points to the current state handler until the tokenizer becomes inactive
 * or is paused. A call made while the loop is already running returns immediately.
 */
private _runParsingLoop(): void {
    if (this.inLoop) return;

    this.inLoop = true;

    try {
        while (this.active && !this.paused) {
            this.consumedAfterSnapshot = 0;

            const cp = this._consume();

            if (!this._ensureHibernation()) {
                this._callState(cp);
            }
        }
    } finally {
        // Always release the guard: if a state handler or token callback throws, leaving
        // `inLoop` set would turn every later write()/resume() into a silent no-op.
        this.inLoop = false;
    }
}

//API
/** Makes the parsing loop stop before the next code point. */
public pause(): void {
    this.paused = true;
}

/**
 * Clears the paused flag and continues tokenizing.
 *
 * @param writeCallback Invoked after the loop returns, unless the tokenizer was paused again.
 * @throws Error when the tokenizer is not paused.
 */
public resume(writeCallback?: () => void): void {
    if (!this.paused) {
        throw new Error('Parser was already resumed');
    }

    this.paused = false;

    // Necessary for synchronous resume.
    if (this.inLoop) return;

    this._runParsingLoop();

    if (!this.paused) {
        writeCallback?.();
    }
}

/**
 * Hands a chunk of input to the preprocessor and tokenizes it.
 *
 * @param chunk Input text.
 * @param isLastChunk Passed through to the preprocessor.
 * @param writeCallback Invoked after the loop returns, unless the tokenizer is paused.
 */
public write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void {
    this.active = true;
    this.preprocessor.write(chunk, isLastChunk);
    this._runParsingLoop();

    if (!this.paused) {
        writeCallback?.();
    }
}

/** Inserts `chunk` at the preprocessor's current position and runs the parsing loop. */
public insertHtmlAtCurrentPos(chunk: string): void {
    this.active = true;
    this.preprocessor.insertHtmlAtCurrentPos(chunk);
    this._runParsingLoop();
}

//Hibernation
/**
 * When the preprocessor has hit the end of the current chunk, gives back everything
 * consumed since the counter was last reset and deactivates the tokenizer.
 *
 * @returns `true` when the tokenizer hibernated, `false` otherwise.
 */
private _ensureHibernation(): boolean {
    if (this.preprocessor.endOfChunkHit) {
        this._unconsume(this.consumedAfterSnapshot);
        this.active = false;

        return true;
    }

    return false;
}

//Consumption
/** Advances the preprocessor by one code point, counts it, and returns it. */
private _consume(): number {
    this.consumedAfterSnapshot++;
    return this.preprocessor.advance();
}

/** Steps the preprocessor back by `count` code points and adjusts the consumed counter. */
private _unconsume(count: number): void {
    this.consumedAfterSnapshot -= count;
    this.preprocessor.retreat(count);
}

/** Switches to `state` and gives back the last code point so that state sees it next. */
private _reconsumeInState(state: State): void {
    this.state = state;
    this._unconsume(1);
}

/** Advances the preprocessor by `count` code points, counting them as consumed. */
private _advanceBy(count: number): void {
    this.consumedAfterSnapshot += count;
    for (let i = 0; i < count; i++) {
        this.preprocessor.advance();
    }
}

/**
 * Consumes the rest of `pattern` when the input at the current position starts with it.
 *
 * @returns `true` when the pattern matched and was consumed.
 */
private _consumeSequenceIfMatch(pattern: string, caseSensitive: boolean): boolean {
    if (this.preprocessor.startsWith(pattern, caseSensitive)) {
        // We will already have consumed one character before calling this method.
        this._advanceBy(pattern.length - 1);
        return true;
    }
    return false;
}

//Token creation
/** Starts a new, empty start tag token located one code point back from the current position. */
private _createStartTagToken(): void {
    this.currentToken = {
        type: TokenType.START_TAG,
        tagName: '',
        tagID: TAG_ID.UNKNOWN,
        selfClosing: false,
        ackSelfClosing: false,
        attrs: [],
        location: this.getCurrentLocation(1),
    };
}

/** Starts a new, empty end tag token located two code points back from the current position. */
private _createEndTagToken(): void {
    this.currentToken = {
        type: TokenType.END_TAG,
        tagName: '',
        tagID: TAG_ID.UNKNOWN,
        selfClosing: false,
        ackSelfClosing: false,
        attrs: [],
        location: this.getCurrentLocation(2),
    };
}

/** Starts a new, empty comment token located `offset` code points back. */
private _createCommentToken(offset: number): void {
    this.currentToken = {
        type: TokenType.COMMENT,
        data: '',
        location: this.getCurrentLocation(offset),
    };
}

/** Starts a new doctype token with the given initial name, using the tracked current location. */
private _createDoctypeToken(initialName: string | null): void {
    this.currentToken = {
        type: TokenType.DOCTYPE,
        name: initialName,
        forceQuirks: false,
        publicId: null,
        systemId: null,
        location: this.currentLocation,
    };
}

/** Starts a pending character token of the given type, using the tracked current location. */
private _createCharacterToken(type: CharacterToken['type'], chars: string): void {
    this.currentCharacterToken = {
        type,
        chars,
        location: this.currentLocation,
    };
}

//Tag attributes
/** Starts a new attribute whose name begins with `attrNameFirstCh` and records where it starts. */
private _createAttr(attrNameFirstCh: string): void {
    this.currentAttr = {
        name: attrNameFirstCh,
        value: '',
    };
    this.currentLocation = this.getCurrentLocation(0);
}

/**
 * Adds the current attribute to the current tag token, or reports a
 * duplicate-attribute error when the token already has one with that name.
 */
private _leaveAttrName(): void {
    const token = this.currentToken as TagToken;

    if (getTokenAttr(token, this.currentAttr.name) === null) {
        token.attrs.push(this.currentAttr);

        if (token.location && this.currentLocation) {
            const attrLocations = (token.location.attrs ??= Object.create(null));
            attrLocations[this.currentAttr.name] = this.currentLocation;

            // Set end location
            this._leaveAttrValue();
        }
    } else {
        this._err(ERR.duplicateAttribute);
    }
}

/** Stamps the preprocessor's current position as the end of the tracked location, if any. */
private _leaveAttrValue(): void {
    if (this.currentLocation) {
        this.currentLocation.endLine = this.preprocessor.line;
        this.currentLocation.endCol = this.preprocessor.col;
        this.currentLocation.endOffset = this.preprocessor.offset;
    }
}

//Token emission
/**
 * Flushes any pending character token, clears the current token, fills in the end
 * of `ct`'s location and resets the tracked current location.
 */
private prepareToken(ct: Token): void {
    this._emitCurrentCharacterToken(ct.location);
    this.currentToken = null;

    if (ct.location) {
        ct.location.endLine = this.preprocessor.line;
        ct.location.endCol = this.preprocessor.col + 1;
        ct.location.endOffset = this.preprocessor.offset + 1;
    }

    this.currentLocation = this.getCurrentLocation(-1);
}

/**
 * Finalizes the current tag token and hands it to `onStartTag` or `onEndTag`.
 * End tags with attributes or a trailing solidus are reported as parse errors.
 */
private emitCurrentTagToken(): void {
    const ct = this.currentToken as TagToken;

    // NOTE(review): `checkselfClosingNode` is defined outside this part of the file;
    // it runs before the token is finalized - confirm what it expects of `ct`.
    checkselfClosingNode(this, ct);

    this.prepareToken(ct);

    ct.tagID = getTagID(ct.tagName);

    if (ct.type === TokenType.START_TAG) {
        this.lastStartTagName = ct.tagName;
        this.handler.onStartTag(ct);
    } else {
        if (ct.attrs.length > 0) {
            this._err(ERR.endTagWithAttributes);
        }

        if (ct.selfClosing) {
            this._err(ERR.endTagWithTrailingSolidus);
        }

        this.handler.onEndTag(ct);
    }

    this.preprocessor.dropParsedChunk();
}

/** Finalizes a comment token, hands it to `onComment` and drops the parsed input. */
private emitCurrentComment(ct: CommentToken): void {
    this.prepareToken(ct);
    this.handler.onComment(ct);

    this.preprocessor.dropParsedChunk();
}

/** Finalizes a doctype token, hands it to `onDoctype` and drops the parsed input. */
private emitCurrentDoctype(ct: DoctypeToken): void {
    this.prepareToken(ct);
    this.handler.onDoctype(ct);

    this.preprocessor.dropParsedChunk();
}

/**
 * Delivers the pending character token, if there is one, to the handler callback
 * matching its type, then clears it.
 *
 * @param nextLocation Location of the token that follows; its start becomes the pending token's end.
 */
private _emitCurrentCharacterToken(nextLocation: Location | null): void {
    if (this.currentCharacterToken) {
        //NOTE: if we have a pending character token, make its end location equal to the
        //current token's start location.
        if (nextLocation && this.currentCharacterToken.location) {
            this.currentCharacterToken.location.endLine = nextLocation.startLine;
            this.currentCharacterToken.location.endCol = nextLocation.startCol;
            this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
        }

        switch (this.currentCharacterToken.type) {
            case TokenType.CHARACTER: {
                this.handler.onCharacter(this.currentCharacterToken);
                break;
            }
            case TokenType.NULL_CHARACTER: {
                this.handler.onNullCharacter(this.currentCharacterToken);
                break;
            }
            case TokenType.WHITESPACE_CHARACTER: {
                this.handler.onWhitespaceCharacter(this.currentCharacterToken);
                break;
            }
        }

        this.currentCharacterToken = null;
    }
}

/** Flushes pending characters, emits a zero-length EOF token and deactivates the tokenizer. */
private _emitEOFToken(): void {
    const location = this.getCurrentLocation(0);

    if (location) {
        location.endLine = location.startLine;
        location.endCol = location.startCol;
        location.endOffset = location.startOffset;
    }

    this._emitCurrentCharacterToken(location);
    this.handler.onEof({ type: TokenType.EOF, location });
    this.active = false;
}

//Characters emission

//OPTIMIZATION: specification uses only one type of character tokens (one token per character).
//This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
//If we have a sequence of characters that belong to the same group, the parser can process it
//as a single solid character token.
//So, there are 3 types of character tokens in parse5:
//1)TokenType.NULL_CHARACTER - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
//2)TokenType.WHITESPACE_CHARACTER - any whitespace/new-line character sequences (e.g. '\n  \r\t   \f')
//3)TokenType.CHARACTER - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
/**
 * Adds `ch` to the pending character token when that token has the same type.
 * Otherwise the pending token (if any) is emitted first and a new one is started.
 */
private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
    const pending = this.currentCharacterToken;

    // Same group as the pending token: just extend it.
    if (pending && pending.type === type) {
        pending.chars += ch;
        return;
    }

    // A pending token of another group ends here: emit it and release the parsed input.
    if (pending) {
        this.currentLocation = this.getCurrentLocation(0);
        this._emitCurrentCharacterToken(this.currentLocation);
        this.preprocessor.dropParsedChunk();
    }

    this._createCharacterToken(type, ch);
}

/** Classifies `cp` as whitespace, NULL or ordinary character and appends it to the character stream. */
private _emitCodePoint(cp: number): void {
    const type = isWhitespace(cp)
        ? TokenType.WHITESPACE_CHARACTER
        : cp === $.NULL
        ? TokenType.NULL_CHARACTER
        : TokenType.CHARACTER;

    this._appendCharToCurrentCharacterToken(type, String.fromCodePoint(cp));
}

//NOTE: used when we emit characters explicitly.
//This is always for non-whitespace and non-null characters, which allows us to avoid additional checks.
private _emitChars(ch: string): void {
    this._appendCharToCurrentCharacterToken(TokenType.CHARACTER, ch);
}

// Character reference helpers
/**
 * Walks the named-entity decode tree starting with `cp`, consuming input as it goes.
 *
 * @param cp First code point after the ampersand.
 * @returns The code points of the longest match, a lone ampersand when the match must be
 * left undecoded inside an attribute value, or `null` when nothing matched.
 */
private _matchNamedCharacterReference(cp: number): number[] | null {
    let matched: number[] | null = null;
    let consumedSinceMatch = 0;
    let missingSemicolon = false;

    for (let i = 0, node = htmlDecodeTree[0]; i >= 0; cp = this._consume()) {
        i = determineBranch(htmlDecodeTree, node, i + 1, cp);

        if (i < 0) break;

        consumedSinceMatch++;

        node = htmlDecodeTree[i];

        const lengthBits = node & BinTrieFlags.VALUE_LENGTH;

        // If the branch is a value, store it and continue
        if (lengthBits) {
            // The mask is the number of bytes of the value, including the current byte.
            const valueLength = (lengthBits >> 14) - 1;
            const endsWithSemicolon = cp === $.SEMICOLON;

            // Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
            // See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
            if (
                !endsWithSemicolon &&
                this._isCharacterReferenceInAttribute() &&
                isEntityInAttributeInvalidEnd(this.preprocessor.peek(1))
            ) {
                //NOTE: we don't flush all consumed code points here, and instead switch back to the original state after
                //emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes.
                matched = [$.AMPERSAND];

                // Skip over the value.
                i += valueLength;
            } else {
                // If this is a surrogate pair, consume the next two bytes.
                if (valueLength === 0) {
                    matched = [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH];
                } else if (valueLength === 1) {
                    matched = [htmlDecodeTree[++i]];
                } else {
                    matched = [htmlDecodeTree[++i], htmlDecodeTree[++i]];
                }

                consumedSinceMatch = 0;
                missingSemicolon = !endsWithSemicolon;
            }

            if (valueLength === 0) {
                // If the value is zero-length, we're done.
                this._consume();
                break;
            }
        }
    }

    this._unconsume(consumedSinceMatch);

    if (missingSemicolon && !this.preprocessor.endOfChunkHit) {
        this._err(ERR.missingSemicolonAfterCharacterReference);
    }

    // We want to emit the error above on the code point after the entity.
    // We always consume one code point too many in the loop, and we wait to
    // unconsume it until after the error is emitted.
    this._unconsume(1);

    return matched;
}

/** True when the character reference being processed started inside an attribute value. */
private _isCharacterReferenceInAttribute(): boolean {
    switch (this.returnState) {
        case State.ATTRIBUTE_VALUE_DOUBLE_QUOTED:
        case State.ATTRIBUTE_VALUE_SINGLE_QUOTED:
        case State.ATTRIBUTE_VALUE_UNQUOTED:
            return true;
        default:
            return false;
    }
}

/** Appends `cp` to the current attribute value when inside an attribute, otherwise emits it as a character. */
private _flushCodePointConsumedAsCharacterReference(cp: number): void {
    if (!this._isCharacterReferenceInAttribute()) {
        this._emitCodePoint(cp);
        return;
    }

    this.currentAttr.value += String.fromCodePoint(cp);
}

// Calling states this way turns out to be much faster than any other approach.
727 private _callState(cp: number): void { 728 switch (this.state) { 729 case State.DATA: { 730 this._stateData(cp); 731 break; 732 } 733 case State.RCDATA: { 734 this._stateRcdata(cp); 735 break; 736 } 737 case State.RAWTEXT: { 738 this._stateRawtext(cp); 739 break; 740 } 741 case State.SCRIPT_DATA: { 742 this._stateScriptData(cp); 743 break; 744 } 745 case State.PLAINTEXT: { 746 this._statePlaintext(cp); 747 break; 748 } 749 case State.TAG_OPEN: { 750 this._stateTagOpen(cp); 751 break; 752 } 753 case State.END_TAG_OPEN: { 754 this._stateEndTagOpen(cp); 755 break; 756 } 757 case State.TAG_NAME: { 758 this._stateTagName(cp); 759 break; 760 } 761 case State.RCDATA_LESS_THAN_SIGN: { 762 this._stateRcdataLessThanSign(cp); 763 break; 764 } 765 case State.RCDATA_END_TAG_OPEN: { 766 this._stateRcdataEndTagOpen(cp); 767 break; 768 } 769 case State.RCDATA_END_TAG_NAME: { 770 this._stateRcdataEndTagName(cp); 771 break; 772 } 773 case State.RAWTEXT_LESS_THAN_SIGN: { 774 this._stateRawtextLessThanSign(cp); 775 break; 776 } 777 case State.RAWTEXT_END_TAG_OPEN: { 778 this._stateRawtextEndTagOpen(cp); 779 break; 780 } 781 case State.RAWTEXT_END_TAG_NAME: { 782 this._stateRawtextEndTagName(cp); 783 break; 784 } 785 case State.SCRIPT_DATA_LESS_THAN_SIGN: { 786 this._stateScriptDataLessThanSign(cp); 787 break; 788 } 789 case State.SCRIPT_DATA_END_TAG_OPEN: { 790 this._stateScriptDataEndTagOpen(cp); 791 break; 792 } 793 case State.SCRIPT_DATA_END_TAG_NAME: { 794 this._stateScriptDataEndTagName(cp); 795 break; 796 } 797 case State.SCRIPT_DATA_ESCAPE_START: { 798 this._stateScriptDataEscapeStart(cp); 799 break; 800 } 801 case State.SCRIPT_DATA_ESCAPE_START_DASH: { 802 this._stateScriptDataEscapeStartDash(cp); 803 break; 804 } 805 case State.SCRIPT_DATA_ESCAPED: { 806 this._stateScriptDataEscaped(cp); 807 break; 808 } 809 case State.SCRIPT_DATA_ESCAPED_DASH: { 810 this._stateScriptDataEscapedDash(cp); 811 break; 812 } 813 case State.SCRIPT_DATA_ESCAPED_DASH_DASH: { 814 
this._stateScriptDataEscapedDashDash(cp); 815 break; 816 } 817 case State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { 818 this._stateScriptDataEscapedLessThanSign(cp); 819 break; 820 } 821 case State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN: { 822 this._stateScriptDataEscapedEndTagOpen(cp); 823 break; 824 } 825 case State.SCRIPT_DATA_ESCAPED_END_TAG_NAME: { 826 this._stateScriptDataEscapedEndTagName(cp); 827 break; 828 } 829 case State.SCRIPT_DATA_DOUBLE_ESCAPE_START: { 830 this._stateScriptDataDoubleEscapeStart(cp); 831 break; 832 } 833 case State.SCRIPT_DATA_DOUBLE_ESCAPED: { 834 this._stateScriptDataDoubleEscaped(cp); 835 break; 836 } 837 case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH: { 838 this._stateScriptDataDoubleEscapedDash(cp); 839 break; 840 } 841 case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: { 842 this._stateScriptDataDoubleEscapedDashDash(cp); 843 break; 844 } 845 case State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: { 846 this._stateScriptDataDoubleEscapedLessThanSign(cp); 847 break; 848 } 849 case State.SCRIPT_DATA_DOUBLE_ESCAPE_END: { 850 this._stateScriptDataDoubleEscapeEnd(cp); 851 break; 852 } 853 case State.BEFORE_ATTRIBUTE_NAME: { 854 this._stateBeforeAttributeName(cp); 855 break; 856 } 857 case State.ATTRIBUTE_NAME: { 858 this._stateAttributeName(cp); 859 break; 860 } 861 case State.AFTER_ATTRIBUTE_NAME: { 862 this._stateAfterAttributeName(cp); 863 break; 864 } 865 case State.BEFORE_ATTRIBUTE_VALUE: { 866 this._stateBeforeAttributeValue(cp); 867 break; 868 } 869 case State.ATTRIBUTE_VALUE_DOUBLE_QUOTED: { 870 this._stateAttributeValueDoubleQuoted(cp); 871 break; 872 } 873 case State.ATTRIBUTE_VALUE_SINGLE_QUOTED: { 874 this._stateAttributeValueSingleQuoted(cp); 875 break; 876 } 877 case State.ATTRIBUTE_VALUE_UNQUOTED: { 878 this._stateAttributeValueUnquoted(cp); 879 break; 880 } 881 case State.AFTER_ATTRIBUTE_VALUE_QUOTED: { 882 this._stateAfterAttributeValueQuoted(cp); 883 break; 884 } 885 case State.SELF_CLOSING_START_TAG: { 886 
this._stateSelfClosingStartTag(cp); 887 break; 888 } 889 case State.BOGUS_COMMENT: { 890 this._stateBogusComment(cp); 891 break; 892 } 893 case State.MARKUP_DECLARATION_OPEN: { 894 this._stateMarkupDeclarationOpen(cp); 895 break; 896 } 897 case State.COMMENT_START: { 898 this._stateCommentStart(cp); 899 break; 900 } 901 case State.COMMENT_START_DASH: { 902 this._stateCommentStartDash(cp); 903 break; 904 } 905 case State.COMMENT: { 906 this._stateComment(cp); 907 break; 908 } 909 case State.COMMENT_LESS_THAN_SIGN: { 910 this._stateCommentLessThanSign(cp); 911 break; 912 } 913 case State.COMMENT_LESS_THAN_SIGN_BANG: { 914 this._stateCommentLessThanSignBang(cp); 915 break; 916 } 917 case State.COMMENT_LESS_THAN_SIGN_BANG_DASH: { 918 this._stateCommentLessThanSignBangDash(cp); 919 break; 920 } 921 case State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: { 922 this._stateCommentLessThanSignBangDashDash(cp); 923 break; 924 } 925 case State.COMMENT_END_DASH: { 926 this._stateCommentEndDash(cp); 927 break; 928 } 929 case State.COMMENT_END: { 930 this._stateCommentEnd(cp); 931 break; 932 } 933 case State.COMMENT_END_BANG: { 934 this._stateCommentEndBang(cp); 935 break; 936 } 937 case State.DOCTYPE: { 938 this._stateDoctype(cp); 939 break; 940 } 941 case State.BEFORE_DOCTYPE_NAME: { 942 this._stateBeforeDoctypeName(cp); 943 break; 944 } 945 case State.DOCTYPE_NAME: { 946 this._stateDoctypeName(cp); 947 break; 948 } 949 case State.AFTER_DOCTYPE_NAME: { 950 this._stateAfterDoctypeName(cp); 951 break; 952 } 953 case State.AFTER_DOCTYPE_PUBLIC_KEYWORD: { 954 this._stateAfterDoctypePublicKeyword(cp); 955 break; 956 } 957 case State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: { 958 this._stateBeforeDoctypePublicIdentifier(cp); 959 break; 960 } 961 case State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: { 962 this._stateDoctypePublicIdentifierDoubleQuoted(cp); 963 break; 964 } 965 case State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: { 966 this._stateDoctypePublicIdentifierSingleQuoted(cp); 967 break; 
968 } 969 case State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER: { 970 this._stateAfterDoctypePublicIdentifier(cp); 971 break; 972 } 973 case State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: { 974 this._stateBetweenDoctypePublicAndSystemIdentifiers(cp); 975 break; 976 } 977 case State.AFTER_DOCTYPE_SYSTEM_KEYWORD: { 978 this._stateAfterDoctypeSystemKeyword(cp); 979 break; 980 } 981 case State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: { 982 this._stateBeforeDoctypeSystemIdentifier(cp); 983 break; 984 } 985 case State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: { 986 this._stateDoctypeSystemIdentifierDoubleQuoted(cp); 987 break; 988 } 989 case State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: { 990 this._stateDoctypeSystemIdentifierSingleQuoted(cp); 991 break; 992 } 993 case State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER: { 994 this._stateAfterDoctypeSystemIdentifier(cp); 995 break; 996 } 997 case State.BOGUS_DOCTYPE: { 998 this._stateBogusDoctype(cp); 999 break; 1000 } 1001 case State.CDATA_SECTION: { 1002 this._stateCdataSection(cp); 1003 break; 1004 } 1005 case State.CDATA_SECTION_BRACKET: { 1006 this._stateCdataSectionBracket(cp); 1007 break; 1008 } 1009 case State.CDATA_SECTION_END: { 1010 this._stateCdataSectionEnd(cp); 1011 break; 1012 } 1013 case State.CHARACTER_REFERENCE: { 1014 this._stateCharacterReference(cp); 1015 break; 1016 } 1017 case State.NAMED_CHARACTER_REFERENCE: { 1018 this._stateNamedCharacterReference(cp); 1019 break; 1020 } 1021 case State.AMBIGUOUS_AMPERSAND: { 1022 this._stateAmbiguousAmpersand(cp); 1023 break; 1024 } 1025 case State.NUMERIC_CHARACTER_REFERENCE: { 1026 this._stateNumericCharacterReference(cp); 1027 break; 1028 } 1029 case State.HEXADEMICAL_CHARACTER_REFERENCE_START: { 1030 this._stateHexademicalCharacterReferenceStart(cp); 1031 break; 1032 } 1033 case State.DECIMAL_CHARACTER_REFERENCE_START: { 1034 this._stateDecimalCharacterReferenceStart(cp); 1035 break; 1036 } 1037 case State.HEXADEMICAL_CHARACTER_REFERENCE: { 1038 
this._stateHexademicalCharacterReference(cp); 1039 break; 1040 } 1041 case State.DECIMAL_CHARACTER_REFERENCE: { 1042 this._stateDecimalCharacterReference(cp); 1043 break; 1044 } 1045 case State.NUMERIC_CHARACTER_REFERENCE_END: { 1046 this._stateNumericCharacterReferenceEnd(); 1047 break; 1048 } 1049 default: { 1050 throw new Error('Unknown state'); 1051 } 1052 } 1053 } 1054 1055 // State machine 1056 1057 // Data state 1058 //------------------------------------------------------------------ 1059 private _stateData(cp: number): void { 1060 switch (cp) { 1061 case $.LESS_THAN_SIGN: { 1062 this.state = State.TAG_OPEN; 1063 break; 1064 } 1065 case $.AMPERSAND: { 1066 this.returnState = State.DATA; 1067 this.state = State.CHARACTER_REFERENCE; 1068 break; 1069 } 1070 case $.NULL: { 1071 this._err(ERR.unexpectedNullCharacter); 1072 this._emitCodePoint(cp); 1073 break; 1074 } 1075 case $.EOF: { 1076 this._emitEOFToken(); 1077 break; 1078 } 1079 default: { 1080 this._emitCodePoint(cp); 1081 } 1082 } 1083 } 1084 1085 // RCDATA state 1086 //------------------------------------------------------------------ 1087 private _stateRcdata(cp: number): void { 1088 switch (cp) { 1089 case $.AMPERSAND: { 1090 this.returnState = State.RCDATA; 1091 this.state = State.CHARACTER_REFERENCE; 1092 break; 1093 } 1094 case $.LESS_THAN_SIGN: { 1095 this.state = State.RCDATA_LESS_THAN_SIGN; 1096 break; 1097 } 1098 case $.NULL: { 1099 this._err(ERR.unexpectedNullCharacter); 1100 this._emitChars(REPLACEMENT_CHARACTER); 1101 break; 1102 } 1103 case $.EOF: { 1104 this._emitEOFToken(); 1105 break; 1106 } 1107 default: { 1108 this._emitCodePoint(cp); 1109 } 1110 } 1111 } 1112 1113 // RAWTEXT state 1114 //------------------------------------------------------------------ 1115 private _stateRawtext(cp: number): void { 1116 switch (cp) { 1117 case $.LESS_THAN_SIGN: { 1118 this.state = State.RAWTEXT_LESS_THAN_SIGN; 1119 break; 1120 } 1121 case $.NULL: { 1122 this._err(ERR.unexpectedNullCharacter); 1123 
this._emitChars(REPLACEMENT_CHARACTER); 1124 break; 1125 } 1126 case $.EOF: { 1127 this._emitEOFToken(); 1128 break; 1129 } 1130 default: { 1131 this._emitCodePoint(cp); 1132 } 1133 } 1134 } 1135 1136 // Script data state 1137 //------------------------------------------------------------------ 1138 private _stateScriptData(cp: number): void { 1139 switch (cp) { 1140 case $.LESS_THAN_SIGN: { 1141 this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; 1142 break; 1143 } 1144 case $.NULL: { 1145 this._err(ERR.unexpectedNullCharacter); 1146 this._emitChars(REPLACEMENT_CHARACTER); 1147 break; 1148 } 1149 case $.EOF: { 1150 this._emitEOFToken(); 1151 break; 1152 } 1153 default: { 1154 this._emitCodePoint(cp); 1155 } 1156 } 1157 } 1158 1159 // PLAINTEXT state 1160 //------------------------------------------------------------------ 1161 private _statePlaintext(cp: number): void { 1162 switch (cp) { 1163 case $.NULL: { 1164 this._err(ERR.unexpectedNullCharacter); 1165 this._emitChars(REPLACEMENT_CHARACTER); 1166 break; 1167 } 1168 case $.EOF: { 1169 this._emitEOFToken(); 1170 break; 1171 } 1172 default: { 1173 this._emitCodePoint(cp); 1174 } 1175 } 1176 } 1177 1178 // Tag open state 1179 //------------------------------------------------------------------ 1180 private _stateTagOpen(cp: number): void { 1181 if (isAsciiLetter(cp)) { 1182 this._createStartTagToken(); 1183 this.state = State.TAG_NAME; 1184 this._stateTagName(cp); 1185 } else 1186 switch (cp) { 1187 case $.EXCLAMATION_MARK: { 1188 this.state = State.MARKUP_DECLARATION_OPEN; 1189 break; 1190 } 1191 case $.SOLIDUS: { 1192 this.state = State.END_TAG_OPEN; 1193 break; 1194 } 1195 case $.QUESTION_MARK: { 1196 this._err(ERR.unexpectedQuestionMarkInsteadOfTagName); 1197 this._createCommentToken(1); 1198 this.state = State.BOGUS_COMMENT; 1199 this._stateBogusComment(cp); 1200 break; 1201 } 1202 case $.EOF: { 1203 this._err(ERR.eofBeforeTagName); 1204 this._emitChars('<'); 1205 this._emitEOFToken(); 1206 break; 1207 } 1208 
default: {
                    this._err(ERR.invalidFirstCharacterOfTagName);
                    this._emitChars('<');
                    this.state = State.DATA;
                    this._stateData(cp);
                }
            }
    }

    // End tag open state
    //------------------------------------------------------------------
    // An ASCII letter starts an end tag token and is handed to the tag
    // name handler. '>' reports `missingEndTagName` and returns to DATA.
    // EOF reports `eofBeforeTagName`, emits the literal "</" and the EOF
    // token. Anything else opens a bogus comment and is handed to the
    // bogus comment handler.
    private _stateEndTagOpen(cp: number): void {
        if (isAsciiLetter(cp)) {
            this._createEndTagToken();
            this.state = State.TAG_NAME;
            this._stateTagName(cp);
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.missingEndTagName);
            this.state = State.DATA;
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofBeforeTagName);
            this._emitChars('</');
            this._emitEOFToken();
            return;
        }

        this._err(ERR.invalidFirstCharacterOfTagName);
        this._createCommentToken(2);
        this.state = State.BOGUS_COMMENT;
        this._stateBogusComment(cp);
    }

    // Tag name state
    //------------------------------------------------------------------
    private _stateTagName(cp: number): void {
        const token = this.currentToken as TagToken;

        switch (cp) {
            case $.SPACE:
            case $.LINE_FEED:
            case $.TABULATION:
            case $.FORM_FEED: {
                this.state = State.BEFORE_ATTRIBUTE_NAME;
                break;
            }
            case $.SOLIDUS: {
                this.state = State.SELF_CLOSING_START_TAG;
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this.state = State.DATA;
                this.emitCurrentTagToken();
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                token.tagName += REPLACEMENT_CHARACTER;
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInTag);
                this._emitEOFToken();
                break;
            }
            default: {
                token.tagName += String.fromCodePoint(isAsciiUpper(cp) ?
toAsciiLower(cp) : cp);
            }
        }
    }

    // RCDATA less-than sign state
    //------------------------------------------------------------------
    // '/' moves on to RCDATA_END_TAG_OPEN. Otherwise the pending '<' is
    // emitted as text and the code point is handed back to the RCDATA
    // handler.
    private _stateRcdataLessThanSign(cp: number): void {
        if (cp === $.SOLIDUS) {
            this.state = State.RCDATA_END_TAG_OPEN;
            return;
        }

        this._emitChars('<');
        this.state = State.RCDATA;
        this._stateRcdata(cp);
    }

    // RCDATA end tag open state
    //------------------------------------------------------------------
    // A non-letter means "</" was plain text: emit it and hand the code
    // point back to the RCDATA handler. A letter is passed on to the
    // RCDATA end tag name handler.
    private _stateRcdataEndTagOpen(cp: number): void {
        if (!isAsciiLetter(cp)) {
            this._emitChars('</');
            this.state = State.RCDATA;
            this._stateRcdata(cp);
            return;
        }

        this.state = State.RCDATA_END_TAG_NAME;
        this._stateRcdataEndTagName(cp);
    }

    // Shared end tag lookup for the RCDATA / RAWTEXT / script data
    // "end tag name" handlers.
    //
    // Checks whether the input starts with `lastStartTagName` and is
    // followed by whitespace, '/' or '>'. If so, an end tag token named
    // `lastStartTagName` is created, the name is skipped, the matching
    // state is entered and `false` is returned. In every other case the
    // result is the negation of `_ensureHibernation()`; the callers in
    // this file emit "</" as text when it is `true`. On the mismatching
    // follow-up path the end tag token has already been created.
    private handleSpecialEndTag(_cp: number): boolean {
        if (!this.preprocessor.startsWith(this.lastStartTagName, false)) {
            return !this._ensureHibernation();
        }

        this._createEndTagToken();
        const endTag = this.currentToken as TagToken;
        endTag.tagName = this.lastStartTagName;

        const following = this.preprocessor.peek(this.lastStartTagName.length);

        if (
            following === $.SPACE ||
            following === $.LINE_FEED ||
            following === $.TABULATION ||
            following === $.FORM_FEED
        ) {
            this._advanceBy(this.lastStartTagName.length);
            this.state = State.BEFORE_ATTRIBUTE_NAME;
            return false;
        }

        if (following === $.SOLIDUS) {
            this._advanceBy(this.lastStartTagName.length);
            this.state = State.SELF_CLOSING_START_TAG;
            return false;
        }

        if (following === $.GREATER_THAN_SIGN) {
            this._advanceBy(this.lastStartTagName.length);
            this.emitCurrentTagToken();
            this.state = State.DATA;
            return false;
        }

        return !this._ensureHibernation();
    }

    // RCDATA end tag name state
    //------------------------------------------------------------------
    private _stateRcdataEndTagName(cp: number): void
{
        if (this.handleSpecialEndTag(cp)) {
            this._emitChars('</');
            this.state = State.RCDATA;
            this._stateRcdata(cp);
        }
    }

    // RAWTEXT less-than sign state
    //------------------------------------------------------------------
    // '/' moves on to RAWTEXT_END_TAG_OPEN. Otherwise the pending '<' is
    // emitted as text and the code point is handed back to the RAWTEXT
    // handler.
    private _stateRawtextLessThanSign(cp: number): void {
        if (cp === $.SOLIDUS) {
            this.state = State.RAWTEXT_END_TAG_OPEN;
            return;
        }

        this._emitChars('<');
        this.state = State.RAWTEXT;
        this._stateRawtext(cp);
    }

    // RAWTEXT end tag open state
    //------------------------------------------------------------------
    // A non-letter means "</" was plain text: emit it and hand the code
    // point back to the RAWTEXT handler. A letter is passed on to the
    // RAWTEXT end tag name handler.
    private _stateRawtextEndTagOpen(cp: number): void {
        if (!isAsciiLetter(cp)) {
            this._emitChars('</');
            this.state = State.RAWTEXT;
            this._stateRawtext(cp);
            return;
        }

        this.state = State.RAWTEXT_END_TAG_NAME;
        this._stateRawtextEndTagName(cp);
    }

    // RAWTEXT end tag name state
    //------------------------------------------------------------------
    // Delegates to `handleSpecialEndTag`; only when that returns true is
    // "</" emitted as text and the code point handed back to RAWTEXT.
    private _stateRawtextEndTagName(cp: number): void {
        if (!this.handleSpecialEndTag(cp)) {
            return;
        }

        this._emitChars('</');
        this.state = State.RAWTEXT;
        this._stateRawtext(cp);
    }

    // Script data less-than sign state
    //------------------------------------------------------------------
    // '/' moves on to SCRIPT_DATA_END_TAG_OPEN; '!' enters
    // SCRIPT_DATA_ESCAPE_START and emits "<!". Anything else emits the
    // pending '<' and is handed back to the script data handler.
    private _stateScriptDataLessThanSign(cp: number): void {
        if (cp === $.SOLIDUS) {
            this.state = State.SCRIPT_DATA_END_TAG_OPEN;
        } else if (cp === $.EXCLAMATION_MARK) {
            this.state = State.SCRIPT_DATA_ESCAPE_START;
            this._emitChars('<!');
        } else {
            this._emitChars('<');
            this.state = State.SCRIPT_DATA;
            this._stateScriptData(cp);
        }
    }

    // Script data end tag open state
    //------------------------------------------------------------------
    private _stateScriptDataEndTagOpen(cp: number): void {
        if (isAsciiLetter(cp)) {
this.state = State.SCRIPT_DATA_END_TAG_NAME;
            this._stateScriptDataEndTagName(cp);
        } else {
            this._emitChars('</');
            this.state = State.SCRIPT_DATA;
            this._stateScriptData(cp);
        }
    }

    // Script data end tag name state
    //------------------------------------------------------------------
    // Delegates to `handleSpecialEndTag`; only when that returns true is
    // "</" emitted as text and the code point handed back to script data.
    private _stateScriptDataEndTagName(cp: number): void {
        if (!this.handleSpecialEndTag(cp)) {
            return;
        }

        this._emitChars('</');
        this.state = State.SCRIPT_DATA;
        this._stateScriptData(cp);
    }

    // Script data escape start state
    //------------------------------------------------------------------
    // A '-' is emitted and moves to SCRIPT_DATA_ESCAPE_START_DASH; any
    // other code point is handed back to the script data handler.
    private _stateScriptDataEscapeStart(cp: number): void {
        if (cp !== $.HYPHEN_MINUS) {
            this.state = State.SCRIPT_DATA;
            this._stateScriptData(cp);
            return;
        }

        this.state = State.SCRIPT_DATA_ESCAPE_START_DASH;
        this._emitChars('-');
    }

    // Script data escape start dash state
    //------------------------------------------------------------------
    // A second '-' is emitted and moves to SCRIPT_DATA_ESCAPED_DASH_DASH;
    // any other code point is handed back to the script data handler.
    private _stateScriptDataEscapeStartDash(cp: number): void {
        if (cp !== $.HYPHEN_MINUS) {
            this.state = State.SCRIPT_DATA;
            this._stateScriptData(cp);
            return;
        }

        this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH;
        this._emitChars('-');
    }

    // Script data escaped state
    //------------------------------------------------------------------
    private _stateScriptDataEscaped(cp: number): void {
        switch (cp) {
            case $.HYPHEN_MINUS: {
                this.state = State.SCRIPT_DATA_ESCAPED_DASH;
                this._emitChars('-');
                break;
            }
            case $.LESS_THAN_SIGN: {
                this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                this._emitChars(REPLACEMENT_CHARACTER);
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInScriptHtmlCommentLikeText);
                this._emitEOFToken();
                break;
}
            default: {
                this._emitCodePoint(cp);
            }
        }
    }

    // Script data escaped dash state
    //------------------------------------------------------------------
    // '-' is emitted and moves to SCRIPT_DATA_ESCAPED_DASH_DASH; '<' moves
    // to SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; EOF reports
    // `eofInScriptHtmlCommentLikeText` and emits the EOF token. NULL
    // (reported, replaced) and every other code point (emitted unchanged)
    // return to SCRIPT_DATA_ESCAPED.
    private _stateScriptDataEscapedDash(cp: number): void {
        if (cp === $.HYPHEN_MINUS) {
            this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH;
            this._emitChars('-');
        } else if (cp === $.LESS_THAN_SIGN) {
            this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._emitChars(REPLACEMENT_CHARACTER);
        } else if (cp === $.EOF) {
            this._err(ERR.eofInScriptHtmlCommentLikeText);
            this._emitEOFToken();
        } else {
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._emitCodePoint(cp);
        }
    }

    // Script data escaped dash dash state
    //------------------------------------------------------------------
    // Further '-' are emitted without leaving this state; '>' is emitted
    // and returns to SCRIPT_DATA; '<' moves to
    // SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN. NULL and ordinary code points
    // fall back to SCRIPT_DATA_ESCAPED; EOF is reported and ends input.
    private _stateScriptDataEscapedDashDash(cp: number): void {
        if (cp === $.HYPHEN_MINUS) {
            this._emitChars('-');
            return;
        }

        if (cp === $.LESS_THAN_SIGN) {
            this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.SCRIPT_DATA;
            this._emitChars('>');
            return;
        }

        if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._emitChars(REPLACEMENT_CHARACTER);
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInScriptHtmlCommentLikeText);
            this._emitEOFToken();
            return;
        }

        this.state = State.SCRIPT_DATA_ESCAPED;
        this._emitCodePoint(cp);
    }

    // Script data escaped less-than sign state
    //------------------------------------------------------------------
    private
_stateScriptDataEscapedLessThanSign(cp: number): void {
        // '/' is the only input that does not emit the pending '<'.
        if (cp === $.SOLIDUS) {
            this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
            return;
        }

        const startsName = isAsciiLetter(cp);

        this._emitChars('<');

        if (startsName) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_START;
            this._stateScriptDataDoubleEscapeStart(cp);
        } else {
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._stateScriptDataEscaped(cp);
        }
    }

    // Script data escaped end tag open state
    //------------------------------------------------------------------
    // A non-letter means "</" was plain text: emit it and hand the code
    // point back to the escaped handler. A letter is passed on to the
    // escaped end tag name handler.
    private _stateScriptDataEscapedEndTagOpen(cp: number): void {
        if (!isAsciiLetter(cp)) {
            this._emitChars('</');
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._stateScriptDataEscaped(cp);
            return;
        }

        this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_NAME;
        this._stateScriptDataEscapedEndTagName(cp);
    }

    // Script data escaped end tag name state
    //------------------------------------------------------------------
    // Delegates to `handleSpecialEndTag`; only when that returns true is
    // "</" emitted as text and the code point handed back to the escaped
    // handler.
    private _stateScriptDataEscapedEndTagName(cp: number): void {
        if (!this.handleSpecialEndTag(cp)) {
            return;
        }

        this._emitChars('</');
        this.state = State.SCRIPT_DATA_ESCAPED;
        this._stateScriptDataEscaped(cp);
    }

    // Script data double escape start state
    //------------------------------------------------------------------
    // When the input starts with the SCRIPT sequence and the code point
    // after it satisfies `isScriptDataDoubleEscapeSequenceEnd`, `cp` and
    // then `$$.SCRIPT.length` consumed code points are emitted and the
    // tokenizer enters SCRIPT_DATA_DOUBLE_ESCAPED. Otherwise, unless
    // `_ensureHibernation()` returns true, `cp` is handed back to the
    // escaped handler.
    private _stateScriptDataDoubleEscapeStart(cp: number): void {
        const opensDoubleEscape =
            this.preprocessor.startsWith($$.SCRIPT, false) &&
            isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek($$.SCRIPT.length));

        if (opensDoubleEscape) {
            this._emitCodePoint(cp);

            let emitted = 0;
            while (emitted < $$.SCRIPT.length) {
                this._emitCodePoint(this._consume());
                emitted++;
            }

            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
            return;
        }

        if (!this._ensureHibernation()) {
            this.state = State.SCRIPT_DATA_ESCAPED;
            this._stateScriptDataEscaped(cp);
        }
    }

    // Script data double
// escaped state
    //------------------------------------------------------------------
    // '-' and '<' are emitted and move to the DOUBLE_ESCAPED_DASH and
    // DOUBLE_ESCAPED_LESS_THAN_SIGN states respectively. NULL is reported
    // and replaced, EOF is reported and ends input, and everything else is
    // emitted unchanged without leaving this state.
    private _stateScriptDataDoubleEscaped(cp: number): void {
        if (cp === $.HYPHEN_MINUS) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
            this._emitChars('-');
        } else if (cp === $.LESS_THAN_SIGN) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
            this._emitChars('<');
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            this._emitChars(REPLACEMENT_CHARACTER);
        } else if (cp === $.EOF) {
            this._err(ERR.eofInScriptHtmlCommentLikeText);
            this._emitEOFToken();
        } else {
            this._emitCodePoint(cp);
        }
    }

    // Script data double escaped dash state
    //------------------------------------------------------------------
    // '-' is emitted and moves to DOUBLE_ESCAPED_DASH_DASH; '<' is emitted
    // and moves to DOUBLE_ESCAPED_LESS_THAN_SIGN; EOF is reported and ends
    // input. NULL (reported, replaced) and ordinary code points return to
    // SCRIPT_DATA_DOUBLE_ESCAPED.
    private _stateScriptDataDoubleEscapedDash(cp: number): void {
        if (cp === $.HYPHEN_MINUS) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
            this._emitChars('-');
            return;
        }

        if (cp === $.LESS_THAN_SIGN) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
            this._emitChars('<');
            return;
        }

        if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
            this._emitChars(REPLACEMENT_CHARACTER);
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInScriptHtmlCommentLikeText);
            this._emitEOFToken();
            return;
        }

        this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
        this._emitCodePoint(cp);
    }

    // Script data double escaped dash dash state
    //------------------------------------------------------------------
    private _stateScriptDataDoubleEscapedDashDash(cp: number): void {
        switch (cp) {
            case $.HYPHEN_MINUS: {
                this._emitChars('-');
                break;
            }
            case $.LESS_THAN_SIGN: {
this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
                this._emitChars('<');
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this.state = State.SCRIPT_DATA;
                this._emitChars('>');
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
                this._emitChars(REPLACEMENT_CHARACTER);
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInScriptHtmlCommentLikeText);
                this._emitEOFToken();
                break;
            }
            default: {
                this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
                this._emitCodePoint(cp);
            }
        }
    }

    // Script data double escaped less-than sign state
    //------------------------------------------------------------------
    // '/' is emitted and moves to SCRIPT_DATA_DOUBLE_ESCAPE_END; any other
    // code point is handed back to the double escaped handler.
    private _stateScriptDataDoubleEscapedLessThanSign(cp: number): void {
        if (cp !== $.SOLIDUS) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
            this._stateScriptDataDoubleEscaped(cp);
            return;
        }

        this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_END;
        this._emitChars('/');
    }

    // Script data double escape end state
    //------------------------------------------------------------------
    // When the input starts with the SCRIPT sequence and the code point
    // after it satisfies `isScriptDataDoubleEscapeSequenceEnd`, `cp` and
    // then `$$.SCRIPT.length` consumed code points are emitted and the
    // tokenizer drops back to SCRIPT_DATA_ESCAPED. Otherwise, unless
    // `_ensureHibernation()` returns true, `cp` is handed back to the
    // double escaped handler.
    private _stateScriptDataDoubleEscapeEnd(cp: number): void {
        const closesDoubleEscape =
            this.preprocessor.startsWith($$.SCRIPT, false) &&
            isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek($$.SCRIPT.length));

        if (closesDoubleEscape) {
            this._emitCodePoint(cp);

            let emitted = 0;
            while (emitted < $$.SCRIPT.length) {
                this._emitCodePoint(this._consume());
                emitted++;
            }

            this.state = State.SCRIPT_DATA_ESCAPED;
            return;
        }

        if (!this._ensureHibernation()) {
            this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
            this._stateScriptDataDoubleEscaped(cp);
        }
    }

    // Before attribute name state
    //------------------------------------------------------------------
    private _stateBeforeAttributeName(cp: number): void {
        switch (cp) {
            case $.SPACE:
            case
$.LINE_FEED:
            case $.TABULATION:
            case $.FORM_FEED: {
                // Ignore whitespace
                break;
            }
            case $.SOLIDUS:
            case $.GREATER_THAN_SIGN:
            case $.EOF: {
                // No attribute starts here: let the "after attribute name"
                // handler process this same code point.
                this.state = State.AFTER_ATTRIBUTE_NAME;
                this._stateAfterAttributeName(cp);
                break;
            }
            case $.EQUALS_SIGN: {
                // A leading '=' is reported and becomes the first character
                // of the new attribute's name.
                this._err(ERR.unexpectedEqualsSignBeforeAttributeName);
                this._createAttr('=');
                this.state = State.ATTRIBUTE_NAME;
                break;
            }
            default: {
                // Start an attribute with an empty name and let the
                // attribute name handler process this same code point.
                this._createAttr('');
                this.state = State.ATTRIBUTE_NAME;
                this._stateAttributeName(cp);
            }
        }
    }

    // Attribute name state
    //------------------------------------------------------------------
    // Accumulates `currentAttr.name` one code point at a time until a
    // terminator (whitespace, '/', '>', EOF or '=') is seen.
    private _stateAttributeName(cp: number): void {
        switch (cp) {
            case $.SPACE:
            case $.LINE_FEED:
            case $.TABULATION:
            case $.FORM_FEED:
            case $.SOLIDUS:
            case $.GREATER_THAN_SIGN:
            case $.EOF: {
                // Name is complete; the terminator itself is processed by
                // the "after attribute name" handler.
                this._leaveAttrName();
                this.state = State.AFTER_ATTRIBUTE_NAME;
                this._stateAfterAttributeName(cp);
                break;
            }
            case $.EQUALS_SIGN: {
                this._leaveAttrName();
                this.state = State.BEFORE_ATTRIBUTE_VALUE;
                break;
            }
            case $.QUOTATION_MARK:
            case $.APOSTROPHE:
            case $.LESS_THAN_SIGN: {
                // Reported as an error, but still appended to the name.
                this._err(ERR.unexpectedCharacterInAttributeName);
                this.currentAttr.name += String.fromCodePoint(cp);
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                this.currentAttr.name += REPLACEMENT_CHARACTER;
                break;
            }
            default: {
                // ASCII upper-case letters are lower-cased before appending.
                this.currentAttr.name += String.fromCodePoint(isAsciiUpper(cp) ?
toAsciiLower(cp) : cp);
            }
        }
    }

    // After attribute name state
    //------------------------------------------------------------------
    // Whitespace is skipped. '/' moves to SELF_CLOSING_START_TAG, '=' to
    // BEFORE_ATTRIBUTE_VALUE, '>' returns to DATA and emits the tag, EOF
    // reports `eofInTag` and emits the EOF token. Anything else starts a
    // new, empty-named attribute and is handed to the attribute name
    // handler.
    private _stateAfterAttributeName(cp: number): void {
        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            // Ignore whitespace
            return;
        }

        if (cp === $.SOLIDUS) {
            this.state = State.SELF_CLOSING_START_TAG;
            return;
        }

        if (cp === $.EQUALS_SIGN) {
            this.state = State.BEFORE_ATTRIBUTE_VALUE;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.DATA;
            this.emitCurrentTagToken();
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInTag);
            this._emitEOFToken();
            return;
        }

        this._createAttr('');
        this.state = State.ATTRIBUTE_NAME;
        this._stateAttributeName(cp);
    }

    // Before attribute value state
    //------------------------------------------------------------------
    // Whitespace is skipped. '"' and "'" select the quoted value states.
    // '>' reports `missingAttributeValue`, returns to DATA and emits the
    // tag. Anything else is handed to the unquoted value handler.
    private _stateBeforeAttributeValue(cp: number): void {
        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            // Ignore whitespace
            return;
        }

        if (cp === $.QUOTATION_MARK) {
            this.state = State.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
        } else if (cp === $.APOSTROPHE) {
            this.state = State.ATTRIBUTE_VALUE_SINGLE_QUOTED;
        } else if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.missingAttributeValue);
            this.state = State.DATA;
            this.emitCurrentTagToken();
        } else {
            this.state = State.ATTRIBUTE_VALUE_UNQUOTED;
            this._stateAttributeValueUnquoted(cp);
        }
    }

    // Attribute value (double-quoted) state
    //------------------------------------------------------------------
    private _stateAttributeValueDoubleQuoted(cp: number): void {
        switch (cp) {
            case $.QUOTATION_MARK: {
                this.state =
State.AFTER_ATTRIBUTE_VALUE_QUOTED;
                break;
            }
            case $.AMPERSAND: {
                this.returnState = State.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
                this.state = State.CHARACTER_REFERENCE;
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                this.currentAttr.value += REPLACEMENT_CHARACTER;
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInTag);
                this._emitEOFToken();
                break;
            }
            default: {
                this.currentAttr.value += String.fromCodePoint(cp);
            }
        }
    }

    // Attribute value (single-quoted) state
    //------------------------------------------------------------------
    // "'" closes the value; '&' starts a character reference that returns
    // here; NULL is reported and appended as REPLACEMENT_CHARACTER; EOF
    // reports `eofInTag` and emits the EOF token. Every other code point
    // is appended to `currentAttr.value`.
    private _stateAttributeValueSingleQuoted(cp: number): void {
        if (cp === $.APOSTROPHE) {
            this.state = State.AFTER_ATTRIBUTE_VALUE_QUOTED;
        } else if (cp === $.AMPERSAND) {
            this.returnState = State.ATTRIBUTE_VALUE_SINGLE_QUOTED;
            this.state = State.CHARACTER_REFERENCE;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            this.currentAttr.value += REPLACEMENT_CHARACTER;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInTag);
            this._emitEOFToken();
        } else {
            this.currentAttr.value += String.fromCodePoint(cp);
        }
    }

    // Attribute value (unquoted) state
    //------------------------------------------------------------------
    private _stateAttributeValueUnquoted(cp: number): void {
        switch (cp) {
            case $.SPACE:
            case $.LINE_FEED:
            case $.TABULATION:
            case $.FORM_FEED: {
                this._leaveAttrValue();
                this.state = State.BEFORE_ATTRIBUTE_NAME;
                break;
            }
            case $.AMPERSAND: {
                this.returnState = State.ATTRIBUTE_VALUE_UNQUOTED;
                this.state = State.CHARACTER_REFERENCE;
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this._leaveAttrValue();
                this.state = State.DATA;
                this.emitCurrentTagToken();
                break;
            }
case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                this.currentAttr.value += REPLACEMENT_CHARACTER;
                break;
            }
            case $.QUOTATION_MARK:
            case $.APOSTROPHE:
            case $.LESS_THAN_SIGN:
            case $.EQUALS_SIGN:
            case $.GRAVE_ACCENT: {
                this._err(ERR.unexpectedCharacterInUnquotedAttributeValue);
                this.currentAttr.value += String.fromCodePoint(cp);
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInTag);
                this._emitEOFToken();
                break;
            }
            default: {
                this.currentAttr.value += String.fromCodePoint(cp);
            }
        }
    }

    // After attribute value (quoted) state
    //------------------------------------------------------------------
    // Whitespace, '/' and '>' each finish the attribute value first and
    // then move to BEFORE_ATTRIBUTE_NAME, SELF_CLOSING_START_TAG or DATA
    // (emitting the tag). EOF reports `eofInTag`. Anything else reports
    // `missingWhitespaceBetweenAttributes` and is handed to the "before
    // attribute name" handler without calling `_leaveAttrValue()`.
    private _stateAfterAttributeValueQuoted(cp: number): void {
        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            this._leaveAttrValue();
            this.state = State.BEFORE_ATTRIBUTE_NAME;
            return;
        }

        if (cp === $.SOLIDUS) {
            this._leaveAttrValue();
            this.state = State.SELF_CLOSING_START_TAG;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this._leaveAttrValue();
            this.state = State.DATA;
            this.emitCurrentTagToken();
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInTag);
            this._emitEOFToken();
            return;
        }

        this._err(ERR.missingWhitespaceBetweenAttributes);
        this.state = State.BEFORE_ATTRIBUTE_NAME;
        this._stateBeforeAttributeName(cp);
    }

    // Self-closing start tag state
    //------------------------------------------------------------------
    private _stateSelfClosingStartTag(cp: number): void {
        switch (cp) {
            case $.GREATER_THAN_SIGN: {
                const token = this.currentToken as TagToken;
                token.selfClosing = true;
                this.state = State.DATA;
                this.emitCurrentTagToken();
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInTag);
                this._emitEOFToken();
break;
            }
            default: {
                this._err(ERR.unexpectedSolidusInTag);
                this.state = State.BEFORE_ATTRIBUTE_NAME;
                this._stateBeforeAttributeName(cp);
            }
        }
    }

    // Bogus comment state
    //------------------------------------------------------------------
    // Collects code points into the current comment token's `data` until
    // '>' (return to DATA, emit the comment) or EOF (emit the comment,
    // then the EOF token). NULL is reported and stored as
    // REPLACEMENT_CHARACTER.
    private _stateBogusComment(cp: number): void {
        const comment = this.currentToken as CommentToken;

        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.DATA;
            this.emitCurrentComment(comment);
        } else if (cp === $.EOF) {
            this.emitCurrentComment(comment);
            this._emitEOFToken();
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            comment.data += REPLACEMENT_CHARACTER;
        } else {
            comment.data += String.fromCodePoint(cp);
        }
    }

    // Markup declaration open state
    //------------------------------------------------------------------
    private _stateMarkupDeclarationOpen(cp: number): void {
        if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
            this._createCommentToken($$.DASH_DASH.length + 1);
            this.state = State.COMMENT_START;
        } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
            // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
            this.currentLocation = this.getCurrentLocation($$.DOCTYPE.length + 1);
            this.state = State.DOCTYPE;
        } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
            if (this.inForeignNode) {
                this.state = State.CDATA_SECTION;
            } else {
                this._err(ERR.cdataInHtmlContent);
                this._createCommentToken($$.CDATA_START.length + 1);
                (this.currentToken as CommentToken).data = '[CDATA[';
                this.state = State.BOGUS_COMMENT;
            }
        }

        //NOTE: Sequence lookups can be interrupted by hibernation.
//In that case, lookup
        //results are no longer valid and we will need to start over.
        else if (!this._ensureHibernation()) {
            this._err(ERR.incorrectlyOpenedComment);
            this._createCommentToken(2);
            this.state = State.BOGUS_COMMENT;
            this._stateBogusComment(cp);
        }
    }

    // Comment start state
    //------------------------------------------------------------------
    // '-' moves to COMMENT_START_DASH. '>' reports
    // `abruptClosingOfEmptyComment`, returns to DATA and emits the current
    // comment. Anything else is handed to the comment handler.
    private _stateCommentStart(cp: number): void {
        if (cp === $.HYPHEN_MINUS) {
            this.state = State.COMMENT_START_DASH;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.abruptClosingOfEmptyComment);
            this.state = State.DATA;
            const comment = this.currentToken as CommentToken;
            this.emitCurrentComment(comment);
            return;
        }

        this.state = State.COMMENT;
        this._stateComment(cp);
    }

    // Comment start dash state
    //------------------------------------------------------------------
    // '-' moves to COMMENT_END. '>' reports `abruptClosingOfEmptyComment`
    // and emits the comment; EOF reports `eofInComment`, emits the comment
    // and then the EOF token. Anything else appends the pending '-' to
    // the comment data and is handed to the comment handler.
    private _stateCommentStartDash(cp: number): void {
        const comment = this.currentToken as CommentToken;

        if (cp === $.HYPHEN_MINUS) {
            this.state = State.COMMENT_END;
        } else if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.abruptClosingOfEmptyComment);
            this.state = State.DATA;
            this.emitCurrentComment(comment);
        } else if (cp === $.EOF) {
            this._err(ERR.eofInComment);
            this.emitCurrentComment(comment);
            this._emitEOFToken();
        } else {
            comment.data += '-';
            this.state = State.COMMENT;
            this._stateComment(cp);
        }
    }

    // Comment state
    //------------------------------------------------------------------
    private _stateComment(cp: number): void {
        const token = this.currentToken as CommentToken;

        switch (cp) {
            case $.HYPHEN_MINUS: {
                this.state = State.COMMENT_END_DASH;
                break;
            }
            case $.LESS_THAN_SIGN: {
token.data += '<';
                this.state = State.COMMENT_LESS_THAN_SIGN;
                break;
            }
            case $.NULL: {
                this._err(ERR.unexpectedNullCharacter);
                token.data += REPLACEMENT_CHARACTER;
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInComment);
                this.emitCurrentComment(token);
                this._emitEOFToken();
                break;
            }
            default: {
                token.data += String.fromCodePoint(cp);
            }
        }
    }

    // Comment less-than sign state
    //------------------------------------------------------------------
    // '!' is appended to the comment data and moves to
    // COMMENT_LESS_THAN_SIGN_BANG; another '<' is appended without a state
    // change. Anything else is handed back to the comment handler.
    private _stateCommentLessThanSign(cp: number): void {
        const comment = this.currentToken as CommentToken;

        if (cp === $.EXCLAMATION_MARK) {
            comment.data += '!';
            this.state = State.COMMENT_LESS_THAN_SIGN_BANG;
        } else if (cp === $.LESS_THAN_SIGN) {
            comment.data += '<';
        } else {
            this.state = State.COMMENT;
            this._stateComment(cp);
        }
    }

    // Comment less-than sign bang state
    //------------------------------------------------------------------
    // '-' moves to COMMENT_LESS_THAN_SIGN_BANG_DASH; anything else is
    // handed back to the comment handler.
    private _stateCommentLessThanSignBang(cp: number): void {
        if (cp !== $.HYPHEN_MINUS) {
            this.state = State.COMMENT;
            this._stateComment(cp);
            return;
        }

        this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH;
    }

    // Comment less-than sign bang dash state
    //------------------------------------------------------------------
    // '-' moves to COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH; anything else is
    // handed to the comment end dash handler.
    private _stateCommentLessThanSignBangDash(cp: number): void {
        if (cp !== $.HYPHEN_MINUS) {
            this.state = State.COMMENT_END_DASH;
            this._stateCommentEndDash(cp);
            return;
        }

        this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH;
    }

    // Comment less-than sign bang dash dash state
    //------------------------------------------------------------------
    private _stateCommentLessThanSignBangDashDash(cp: number): void {
        if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF)
{
            this._err(ERR.nestedComment);
        }

        this.state = State.COMMENT_END;
        this._stateCommentEnd(cp);
    }

    // Comment end dash state
    //------------------------------------------------------------------
    // '-' moves to COMMENT_END. EOF reports `eofInComment`, emits the
    // comment and then the EOF token. Anything else appends the pending
    // '-' to the comment data and is handed to the comment handler.
    private _stateCommentEndDash(cp: number): void {
        const comment = this.currentToken as CommentToken;

        if (cp === $.HYPHEN_MINUS) {
            this.state = State.COMMENT_END;
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInComment);
            this.emitCurrentComment(comment);
            this._emitEOFToken();
            return;
        }

        comment.data += '-';
        this.state = State.COMMENT;
        this._stateComment(cp);
    }

    // Comment end state
    //------------------------------------------------------------------
    // '>' returns to DATA and emits the comment; '!' moves to
    // COMMENT_END_BANG; an extra '-' is appended to the comment data. EOF
    // reports `eofInComment`, emits the comment and the EOF token.
    // Anything else appends the pending "--" and is handed to the comment
    // handler.
    private _stateCommentEnd(cp: number): void {
        const comment = this.currentToken as CommentToken;

        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.DATA;
            this.emitCurrentComment(comment);
        } else if (cp === $.EXCLAMATION_MARK) {
            this.state = State.COMMENT_END_BANG;
        } else if (cp === $.HYPHEN_MINUS) {
            comment.data += '-';
        } else if (cp === $.EOF) {
            this._err(ERR.eofInComment);
            this.emitCurrentComment(comment);
            this._emitEOFToken();
        } else {
            comment.data += '--';
            this.state = State.COMMENT;
            this._stateComment(cp);
        }
    }

    // Comment end bang state
    //------------------------------------------------------------------
    private _stateCommentEndBang(cp: number): void {
        const token = this.currentToken as CommentToken;

        switch (cp) {
            case $.HYPHEN_MINUS: {
                token.data += '--!';
                this.state = State.COMMENT_END_DASH;
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this._err(ERR.incorrectlyClosedComment);
                this.state = State.DATA;
                this.emitCurrentComment(token);
                break;
}
            case $.EOF: {
                this._err(ERR.eofInComment);
                this.emitCurrentComment(token);
                this._emitEOFToken();
                break;
            }
            default: {
                token.data += '--!';
                this.state = State.COMMENT;
                this._stateComment(cp);
            }
        }
    }

    // DOCTYPE state
    //------------------------------------------------------------------
    // Whitespace moves to BEFORE_DOCTYPE_NAME. '>' does the same and is
    // then handed to that state's handler. EOF reports `eofInDoctype`,
    // creates a nameless doctype token with `forceQuirks` set, emits it
    // and then the EOF token. Anything else reports
    // `missingWhitespaceBeforeDoctypeName` and is handed to the "before
    // DOCTYPE name" handler.
    private _stateDoctype(cp: number): void {
        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            this.state = State.BEFORE_DOCTYPE_NAME;
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.BEFORE_DOCTYPE_NAME;
            this._stateBeforeDoctypeName(cp);
            return;
        }

        if (cp === $.EOF) {
            this._err(ERR.eofInDoctype);
            this._createDoctypeToken(null);
            const doctype = this.currentToken as DoctypeToken;
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
            return;
        }

        this._err(ERR.missingWhitespaceBeforeDoctypeName);
        this.state = State.BEFORE_DOCTYPE_NAME;
        this._stateBeforeDoctypeName(cp);
    }

    // Before DOCTYPE name state
    //------------------------------------------------------------------
    private _stateBeforeDoctypeName(cp: number): void {
        if (isAsciiUpper(cp)) {
            this._createDoctypeToken(String.fromCharCode(toAsciiLower(cp)));
            this.state = State.DOCTYPE_NAME;
        } else
            switch (cp) {
                case $.SPACE:
                case $.LINE_FEED:
                case $.TABULATION:
                case $.FORM_FEED: {
                    // Ignore whitespace
                    break;
                }
                case $.NULL: {
                    this._err(ERR.unexpectedNullCharacter);
                    this._createDoctypeToken(REPLACEMENT_CHARACTER);
                    this.state = State.DOCTYPE_NAME;
                    break;
                }
                case $.GREATER_THAN_SIGN: {
                    this._err(ERR.missingDoctypeName);
                    this._createDoctypeToken(null);
                    const token = this.currentToken as DoctypeToken;
                    token.forceQuirks =
true; 2389 this.emitCurrentDoctype(token); 2390 this.state = State.DATA; 2391 break; 2392 } 2393 case $.EOF: { 2394 this._err(ERR.eofInDoctype); 2395 this._createDoctypeToken(null); 2396 const token = this.currentToken as DoctypeToken; 2397 token.forceQuirks = true; 2398 this.emitCurrentDoctype(token); 2399 this._emitEOFToken(); 2400 break; 2401 } 2402 default: { 2403 this._createDoctypeToken(String.fromCodePoint(cp)); 2404 this.state = State.DOCTYPE_NAME; 2405 } 2406 } 2407 } 2408 2409 // DOCTYPE name state 2410 //------------------------------------------------------------------ 2411 private _stateDoctypeName(cp: number): void { 2412 const token = this.currentToken as DoctypeToken; 2413 2414 switch (cp) { 2415 case $.SPACE: 2416 case $.LINE_FEED: 2417 case $.TABULATION: 2418 case $.FORM_FEED: { 2419 this.state = State.AFTER_DOCTYPE_NAME; 2420 break; 2421 } 2422 case $.GREATER_THAN_SIGN: { 2423 this.state = State.DATA; 2424 this.emitCurrentDoctype(token); 2425 break; 2426 } 2427 case $.NULL: { 2428 this._err(ERR.unexpectedNullCharacter); 2429 token.name += REPLACEMENT_CHARACTER; 2430 break; 2431 } 2432 case $.EOF: { 2433 this._err(ERR.eofInDoctype); 2434 token.forceQuirks = true; 2435 this.emitCurrentDoctype(token); 2436 this._emitEOFToken(); 2437 break; 2438 } 2439 default: { 2440 token.name += String.fromCodePoint(isAsciiUpper(cp) ? 
toAsciiLower(cp) : cp); 2441 } 2442 } 2443 } 2444 2445 // After DOCTYPE name state 2446 //------------------------------------------------------------------ 2447 private _stateAfterDoctypeName(cp: number): void { 2448 const token = this.currentToken as DoctypeToken; 2449 2450 switch (cp) { 2451 case $.SPACE: 2452 case $.LINE_FEED: 2453 case $.TABULATION: 2454 case $.FORM_FEED: { 2455 // Ignore whitespace 2456 break; 2457 } 2458 case $.GREATER_THAN_SIGN: { 2459 this.state = State.DATA; 2460 this.emitCurrentDoctype(token); 2461 break; 2462 } 2463 case $.EOF: { 2464 this._err(ERR.eofInDoctype); 2465 token.forceQuirks = true; 2466 this.emitCurrentDoctype(token); 2467 this._emitEOFToken(); 2468 break; 2469 } 2470 default: 2471 if (this._consumeSequenceIfMatch($$.PUBLIC, false)) { 2472 this.state = State.AFTER_DOCTYPE_PUBLIC_KEYWORD; 2473 } else if (this._consumeSequenceIfMatch($$.SYSTEM, false)) { 2474 this.state = State.AFTER_DOCTYPE_SYSTEM_KEYWORD; 2475 } 2476 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup 2477 //results are no longer valid and we will need to start over. 
2478 else if (!this._ensureHibernation()) { 2479 this._err(ERR.invalidCharacterSequenceAfterDoctypeName); 2480 token.forceQuirks = true; 2481 this.state = State.BOGUS_DOCTYPE; 2482 this._stateBogusDoctype(cp); 2483 } 2484 } 2485 } 2486 2487 // After DOCTYPE public keyword state 2488 //------------------------------------------------------------------ 2489 private _stateAfterDoctypePublicKeyword(cp: number): void { 2490 const token = this.currentToken as DoctypeToken; 2491 2492 switch (cp) { 2493 case $.SPACE: 2494 case $.LINE_FEED: 2495 case $.TABULATION: 2496 case $.FORM_FEED: { 2497 this.state = State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; 2498 break; 2499 } 2500 case $.QUOTATION_MARK: { 2501 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 2502 token.publicId = ''; 2503 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; 2504 break; 2505 } 2506 case $.APOSTROPHE: { 2507 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); 2508 token.publicId = ''; 2509 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; 2510 break; 2511 } 2512 case $.GREATER_THAN_SIGN: { 2513 this._err(ERR.missingDoctypePublicIdentifier); 2514 token.forceQuirks = true; 2515 this.state = State.DATA; 2516 this.emitCurrentDoctype(token); 2517 break; 2518 } 2519 case $.EOF: { 2520 this._err(ERR.eofInDoctype); 2521 token.forceQuirks = true; 2522 this.emitCurrentDoctype(token); 2523 this._emitEOFToken(); 2524 break; 2525 } 2526 default: { 2527 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 2528 token.forceQuirks = true; 2529 this.state = State.BOGUS_DOCTYPE; 2530 this._stateBogusDoctype(cp); 2531 } 2532 } 2533 } 2534 2535 // Before DOCTYPE public identifier state 2536 //------------------------------------------------------------------ 2537 private _stateBeforeDoctypePublicIdentifier(cp: number): void { 2538 const token = this.currentToken as DoctypeToken; 2539 2540 switch (cp) { 2541 case $.SPACE: 2542 case $.LINE_FEED: 2543 case $.TABULATION: 2544 case 
$.FORM_FEED: { 2545 // Ignore whitespace 2546 break; 2547 } 2548 case $.QUOTATION_MARK: { 2549 token.publicId = ''; 2550 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; 2551 break; 2552 } 2553 case $.APOSTROPHE: { 2554 token.publicId = ''; 2555 this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; 2556 break; 2557 } 2558 case $.GREATER_THAN_SIGN: { 2559 this._err(ERR.missingDoctypePublicIdentifier); 2560 token.forceQuirks = true; 2561 this.state = State.DATA; 2562 this.emitCurrentDoctype(token); 2563 break; 2564 } 2565 case $.EOF: { 2566 this._err(ERR.eofInDoctype); 2567 token.forceQuirks = true; 2568 this.emitCurrentDoctype(token); 2569 this._emitEOFToken(); 2570 break; 2571 } 2572 default: { 2573 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); 2574 token.forceQuirks = true; 2575 this.state = State.BOGUS_DOCTYPE; 2576 this._stateBogusDoctype(cp); 2577 } 2578 } 2579 } 2580 2581 // DOCTYPE public identifier (double-quoted) state 2582 //------------------------------------------------------------------ 2583 private _stateDoctypePublicIdentifierDoubleQuoted(cp: number): void { 2584 const token = this.currentToken as DoctypeToken; 2585 2586 switch (cp) { 2587 case $.QUOTATION_MARK: { 2588 this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; 2589 break; 2590 } 2591 case $.NULL: { 2592 this._err(ERR.unexpectedNullCharacter); 2593 token.publicId += REPLACEMENT_CHARACTER; 2594 break; 2595 } 2596 case $.GREATER_THAN_SIGN: { 2597 this._err(ERR.abruptDoctypePublicIdentifier); 2598 token.forceQuirks = true; 2599 this.emitCurrentDoctype(token); 2600 this.state = State.DATA; 2601 break; 2602 } 2603 case $.EOF: { 2604 this._err(ERR.eofInDoctype); 2605 token.forceQuirks = true; 2606 this.emitCurrentDoctype(token); 2607 this._emitEOFToken(); 2608 break; 2609 } 2610 default: { 2611 token.publicId += String.fromCodePoint(cp); 2612 } 2613 } 2614 } 2615 2616 // DOCTYPE public identifier (single-quoted) state 2617 
//------------------------------------------------------------------
    // Accumulates the public identifier until the closing "'". A '>' or EOF
    // before that emits the doctype with force-quirks set.
    private _stateDoctypePublicIdentifierSingleQuoted(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.APOSTROPHE) {
            this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            doctype.publicId += REPLACEMENT_CHARACTER;
        } else if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.abruptDoctypePublicIdentifier);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this.state = State.DATA;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInDoctype);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
        } else {
            doctype.publicId += String.fromCodePoint(cp);
        }
    }

    // After DOCTYPE public identifier state
    //------------------------------------------------------------------
    // A quote directly after the public identifier starts the system
    // identifier but is reported as missing whitespace.
    private _stateAfterDoctypePublicIdentifier(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            this.state = State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
            return;
        }

        switch (cp) {
            case $.GREATER_THAN_SIGN: {
                this.state = State.DATA;
                this.emitCurrentDoctype(doctype);
                break;
            }
            case $.QUOTATION_MARK:
            case $.APOSTROPHE: {
                this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
                doctype.systemId = '';
                this.state =
                    cp === $.QUOTATION_MARK
                        ? State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
                        : State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInDoctype);
                doctype.forceQuirks = true;
                this.emitCurrentDoctype(doctype);
                this._emitEOFToken();
                break;
            }
            default: {
                this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.BOGUS_DOCTYPE;
                this._stateBogusDoctype(cp);
            }
        }
    }

    // Between DOCTYPE public and system identifiers state
    //------------------------------------------------------------------
    // Skips whitespace; a quote opens the system identifier, '>' emits the
    // doctype as it stands.
    private _stateBetweenDoctypePublicAndSystemIdentifiers(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            // Whitespace is skipped; the state does not change.
            return;
        }

        switch (cp) {
            case $.GREATER_THAN_SIGN: {
                this.emitCurrentDoctype(doctype);
                this.state = State.DATA;
                break;
            }
            case $.QUOTATION_MARK:
            case $.APOSTROPHE: {
                doctype.systemId = '';
                this.state =
                    cp === $.QUOTATION_MARK
                        ? State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
                        : State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInDoctype);
                doctype.forceQuirks = true;
                this.emitCurrentDoctype(doctype);
                this._emitEOFToken();
                break;
            }
            default: {
                this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.BOGUS_DOCTYPE;
                this._stateBogusDoctype(cp);
            }
        }
    }

    // After DOCTYPE system keyword state
    //------------------------------------------------------------------
    // A quote directly after the keyword starts the system identifier but is
    // reported as missing whitespace.
    private _stateAfterDoctypeSystemKeyword(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            this.state = State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
            return;
        }

        switch (cp) {
            case $.QUOTATION_MARK:
            case $.APOSTROPHE: {
                this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword);
                doctype.systemId = '';
                this.state =
                    cp === $.QUOTATION_MARK
                        ? State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
                        : State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this._err(ERR.missingDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.DATA;
                this.emitCurrentDoctype(doctype);
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInDoctype);
                doctype.forceQuirks = true;
                this.emitCurrentDoctype(doctype);
                this._emitEOFToken();
                break;
            }
            default: {
                this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.BOGUS_DOCTYPE;
                this._stateBogusDoctype(cp);
            }
        }
    }

    // Before DOCTYPE system identifier state
    //------------------------------------------------------------------
    // Skips whitespace and waits for the opening quote of the system
    // identifier.
    private _stateBeforeDoctypeSystemIdentifier(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            // Whitespace is skipped; the state does not change.
            return;
        }

        switch (cp) {
            case $.QUOTATION_MARK:
            case $.APOSTROPHE: {
                doctype.systemId = '';
                this.state =
                    cp === $.QUOTATION_MARK
                        ? State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
                        : State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                break;
            }
            case $.GREATER_THAN_SIGN: {
                this._err(ERR.missingDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.DATA;
                this.emitCurrentDoctype(doctype);
                break;
            }
            case $.EOF: {
                this._err(ERR.eofInDoctype);
                doctype.forceQuirks = true;
                this.emitCurrentDoctype(doctype);
                this._emitEOFToken();
                break;
            }
            default: {
                this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
                doctype.forceQuirks = true;
                this.state = State.BOGUS_DOCTYPE;
                this._stateBogusDoctype(cp);
            }
        }
    }

    // DOCTYPE system identifier (double-quoted) state
    //------------------------------------------------------------------
    // Accumulates the system identifier until the closing '"'. A '>' or EOF
    // before that emits the doctype with force-quirks set.
    private _stateDoctypeSystemIdentifierDoubleQuoted(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.QUOTATION_MARK) {
            this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            doctype.systemId += REPLACEMENT_CHARACTER;
        } else if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.abruptDoctypeSystemIdentifier);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this.state = State.DATA;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInDoctype);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
        } else {
            doctype.systemId += String.fromCodePoint(cp);
        }
    }

    // DOCTYPE system identifier (single-quoted) state
    //------------------------------------------------------------------
    // Accumulates the system identifier until the closing "'". A '>' or EOF
    // before that emits the doctype with force-quirks set.
    private _stateDoctypeSystemIdentifierSingleQuoted(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.APOSTROPHE) {
            this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
            doctype.systemId += REPLACEMENT_CHARACTER;
        } else if (cp === $.GREATER_THAN_SIGN) {
            this._err(ERR.abruptDoctypeSystemIdentifier);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this.state = State.DATA;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInDoctype);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
        } else {
            doctype.systemId += String.fromCodePoint(cp);
        }
    }

    // After DOCTYPE system identifier state
    //------------------------------------------------------------------
    // Skips whitespace and expects '>'. An unexpected code point is reported
    // and handed to the bogus-DOCTYPE handler; this branch does not set
    // force-quirks.
    private _stateAfterDoctypeSystemIdentifier(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED) {
            // Whitespace is skipped; the state does not change.
            return;
        }

        if (cp === $.GREATER_THAN_SIGN) {
            this.emitCurrentDoctype(doctype);
            this.state = State.DATA;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInDoctype);
            doctype.forceQuirks = true;
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
        } else {
            this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier);
            this.state = State.BOGUS_DOCTYPE;
            this._stateBogusDoctype(cp);
        }
    }

    // Bogus DOCTYPE state
    //------------------------------------------------------------------
    // Discards input until '>' or EOF, then emits the doctype as it stands.
    // NULL is reported; every other code point is ignored.
    private _stateBogusDoctype(cp: number): void {
        const doctype = this.currentToken as DoctypeToken;

        if (cp === $.GREATER_THAN_SIGN) {
            this.emitCurrentDoctype(doctype);
            this.state = State.DATA;
        } else if (cp === $.NULL) {
            this._err(ERR.unexpectedNullCharacter);
        } else if (cp === $.EOF) {
            this.emitCurrentDoctype(doctype);
            this._emitEOFToken();
        }
    }

    // CDATA section state
    //------------------------------------------------------------------
    // Emits code points as they come; ']' may begin the closing sequence.
    private _stateCdataSection(cp: number): void {
        if (cp === $.RIGHT_SQUARE_BRACKET) {
            this.state = State.CDATA_SECTION_BRACKET;
        } else if (cp === $.EOF) {
            this._err(ERR.eofInCdata);
            this._emitEOFToken();
        } else {
            this._emitCodePoint(cp);
        }
    }

    // CDATA section bracket state
    //------------------------------------------------------------------
    // One ']' has been seen. A second one moves on; anything else emits the
    // pending ']' and is processed by the CDATA section handler.
    private _stateCdataSectionBracket(cp: number): void {
        if (cp === $.RIGHT_SQUARE_BRACKET) {
            this.state = State.CDATA_SECTION_END;
            return;
        }

        this._emitChars(']');
        this.state = State.CDATA_SECTION;
        this._stateCdataSection(cp);
    }

    // CDATA section end state
    //------------------------------------------------------------------
    // ']]' has been seen. '>' returns to the data state, a further ']' emits
    // one bracket and stays here, anything else emits both brackets and is
    // processed by the CDATA section handler.
    private _stateCdataSectionEnd(cp: number): void {
        if (cp === $.GREATER_THAN_SIGN) {
            this.state = State.DATA;
        } else if (cp === $.RIGHT_SQUARE_BRACKET) {
            this._emitChars(']');
        } else {
            this._emitChars(']]');
            this.state = State.CDATA_SECTION;
            this._stateCdataSection(cp);
        }
    }

    // Character reference state
    //------------------------------------------------------------------
    // Chooses between the numeric ('#') and named (ASCII alphanumeric)
    // branches. Otherwise '&' is flushed and the code point is reconsumed in
    // the return state.
    private _stateCharacterReference(cp: number): void {
        if (cp === $.NUMBER_SIGN) {
            this.state = State.NUMERIC_CHARACTER_REFERENCE;
            return;
        }

        if (isAsciiAlphaNumeric(cp)) {
            this.state = State.NAMED_CHARACTER_REFERENCE;
            this._stateNamedCharacterReference(cp);
            return;
        }

        this._flushCodePointConsumedAsCharacterReference($.AMPERSAND);
        this._reconsumeInState(this.returnState);
    }

    // Named character reference state
    //------------------------------------------------------------------
    // Tries to match a named reference starting at cp. On a match its code
    // points are flushed and the return state is restored; on no match '&'
    // is flushed and the ambiguous-ampersand state takes over.
    private _stateNamedCharacterReference(cp: number): void {
        const matched = this._matchNamedCharacterReference(cp);

        // NOTE: matching can be interrupted by hibernation. In that case the
        // match result is not valid: stay in this state and try again.
        if (this._ensureHibernation()) {
            return;
        }

        if (!matched) {
            this._flushCodePointConsumedAsCharacterReference($.AMPERSAND);
            this.state = State.AMBIGUOUS_AMPERSAND;
            return;
        }

        for (let idx = 0; idx < matched.length; idx++) {
            this._flushCodePointConsumedAsCharacterReference(matched[idx]);
        }
        this.state = this.returnState;
    }

    // Ambiguous ampersand state
    //------------------------------------------------------------------
    // Flushes ASCII alphanumerics. The first other code point ends the run
    // and is reconsumed in the return state; a ';' is reported first.
    private _stateAmbiguousAmpersand(cp: number): void {
        if (isAsciiAlphaNumeric(cp)) {
            this._flushCodePointConsumedAsCharacterReference(cp);
            return;
        }

        if (cp === $.SEMICOLON) {
            this._err(ERR.unknownNamedCharacterReference);
        }
        this._reconsumeInState(this.returnState);
    }

    // Numeric character reference state
    //------------------------------------------------------------------
    // Resets the accumulated reference code, then picks the hexadecimal
    // branch on 'x' / 'X' and the decimal branch otherwise.
    private _stateNumericCharacterReference(cp: number): void {
        this.charRefCode = 0;

        const hasHexPrefix = cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X;

        if (hasHexPrefix) {
            this.state = State.HEXADEMICAL_CHARACTER_REFERENCE_START;
            return;
        }

        this.state = State.DECIMAL_CHARACTER_REFERENCE_START;
        this._stateDecimalCharacterReferenceStart(cp);
    }

    // Hexademical character reference start state
    //------------------------------------------------------------------
    // Requires at least one hex digit. Without one, '&' and '#' are flushed,
    // two code points are unconsumed and the return state is restored.
    private _stateHexademicalCharacterReferenceStart(cp: number): void {
        if (isAsciiHexDigit(cp)) {
            this.state = State.HEXADEMICAL_CHARACTER_REFERENCE;
            this._stateHexademicalCharacterReference(cp);
            return;
        }

        this._err(ERR.absenceOfDigitsInNumericCharacterReference);
        this._flushCodePointConsumedAsCharacterReference($.AMPERSAND);
        this._flushCodePointConsumedAsCharacterReference($.NUMBER_SIGN);
        this._unconsume(2);
        this.state = this.returnState;
    }

    // Decimal character reference start state
//------------------------------------------------------------------
    // Requires at least one decimal digit. Without one, '&' and '#' are
    // flushed and the code point is reconsumed in the return state.
    private _stateDecimalCharacterReferenceStart(cp: number): void {
        if (isAsciiDigit(cp)) {
            this.state = State.DECIMAL_CHARACTER_REFERENCE;
            this._stateDecimalCharacterReference(cp);
            return;
        }

        this._err(ERR.absenceOfDigitsInNumericCharacterReference);
        this._flushCodePointConsumedAsCharacterReference($.AMPERSAND);
        this._flushCodePointConsumedAsCharacterReference($.NUMBER_SIGN);
        this._reconsumeInState(this.returnState);
    }

    // Hexademical character reference state
    //------------------------------------------------------------------
    // Folds hex digits into charRefCode. ';' ends the reference; any other
    // code point ends it too, after a missing-semicolon error.
    private _stateHexademicalCharacterReference(cp: number): void {
        // Value subtracted from cp to obtain the digit value, or -1 when cp
        // is not a hex digit: 'A' (0x41) - 0x37 = 10, 'a' (0x61) - 0x57 = 10,
        // '0' (0x30) - 0x30 = 0.
        let digitOffset = -1;

        if (isAsciiUpperHexDigit(cp)) {
            digitOffset = 0x37;
        } else if (isAsciiLowerHexDigit(cp)) {
            digitOffset = 0x57;
        } else if (isAsciiDigit(cp)) {
            digitOffset = 0x30;
        }

        if (digitOffset !== -1) {
            this.charRefCode = this.charRefCode * 16 + cp - digitOffset;
            return;
        }

        if (cp === $.SEMICOLON) {
            this.state = State.NUMERIC_CHARACTER_REFERENCE_END;
            return;
        }

        this._err(ERR.missingSemicolonAfterCharacterReference);
        this.state = State.NUMERIC_CHARACTER_REFERENCE_END;
        this._stateNumericCharacterReferenceEnd();
    }

    // Decimal character reference state
    //------------------------------------------------------------------
    // Folds decimal digits into charRefCode. ';' ends the reference; any
    // other code point ends it too, after a missing-semicolon error.
    private _stateDecimalCharacterReference(cp: number): void {
        if (isAsciiDigit(cp)) {
            this.charRefCode = this.charRefCode * 10 + cp - 0x30;
            return;
        }

        if (cp === $.SEMICOLON) {
            this.state = State.NUMERIC_CHARACTER_REFERENCE_END;
            return;
        }

        this._err(ERR.missingSemicolonAfterCharacterReference);
        this.state = State.NUMERIC_CHARACTER_REFERENCE_END;
        this._stateNumericCharacterReferenceEnd();
    }

    // Numeric character reference end state
//------------------------------------------------------------------
    // Validates the accumulated reference code, flushes the resulting code
    // point and reconsumes in the return state. NULL, values above 0x10FFFF
    // and surrogates become the replacement character. Noncharacters are
    // reported but kept. Control code points and CARRIAGE RETURN are
    // reported and, when the C1 replacement table has an entry, remapped.
    private _stateNumericCharacterReferenceEnd(): void {
        const code = this.charRefCode;

        if (code === $.NULL) {
            this._err(ERR.nullCharacterReference);
            this.charRefCode = $.REPLACEMENT_CHARACTER;
        } else if (code > 0x10_ff_ff) {
            this._err(ERR.characterReferenceOutsideUnicodeRange);
            this.charRefCode = $.REPLACEMENT_CHARACTER;
        } else if (isSurrogate(code)) {
            this._err(ERR.surrogateCharacterReference);
            this.charRefCode = $.REPLACEMENT_CHARACTER;
        } else if (isUndefinedCodePoint(code)) {
            this._err(ERR.noncharacterCharacterReference);
        } else if (isControlCodePoint(code) || code === $.CARRIAGE_RETURN) {
            this._err(ERR.controlCharacterReference);

            const mapped = C1_CONTROLS_REFERENCE_REPLACEMENTS.get(code);

            if (mapped !== undefined) {
                this.charRefCode = mapped;
            }
        }

        this._flushCodePointConsumedAsCharacterReference(this.charRefCode);
        this._reconsumeInState(this.returnState);
    }
}

/**
 * Records tag-closing problems in `parse.compileResult.log`.
 *
 * `parse.nodeInfo` remembers the most recent non-self-closing start tag whose
 * name `parse.validator.isSupportedSelfClosing` accepts: `tn` is the tag name,
 * `pos` is its "line,column" start position, and `sc` becomes true once a
 * matching end tag is seen. While such a tag is still open, a token with an
 * unsupported name, or a start tag at a different position, logs
 * "must be closed" at the remembered position and clears the record.
 * A self-closing start tag with an unsupported name logs
 * "can not use selfClosing" at the token's own position.
 *
 * @param parse Tokenizer providing `validator`, `nodeInfo` and `compileResult`.
 * @param token Start or end tag token to check.
 */
function checkselfClosingNode(parse: Tokenizer, token: TagToken) {
    const tagName = (token.tagName || '').toLowerCase();
    const selfClosing = token.selfClosing;
    const supported = parse.validator.isSupportedSelfClosing(tagName);
    const isStartTag = token.type === TokenType.START_TAG;
    // "line,column" key of this token; missing location parts stringify as "undefined".
    const tokenPos = String(token.location?.startLine) + ',' + String(token.location?.startCol);

    // A remembered tag is still open while another named tag arrives.
    if (parse.nodeInfo.tn && tagName && !parse.nodeInfo.sc) {
        const isOtherStartTag = isStartTag && tokenPos !== parse.nodeInfo.pos;

        if (!supported || isOtherStartTag) {
            const [lineText, columnText] = parse.nodeInfo.pos.split(',');

            parse.compileResult.log.push({
                line: Number(lineText) || 1,
                column: Number(columnText) || 1,
                reason: 'ERROR: tag `' + parse.nodeInfo.tn + '` must be closed, please follow norm',
            });
            parse.nodeInfo = { tn: '', sc: false, pos: '' };
        }
    }

    // Track opening and closing of supported tags.
    if (tagName && supported) {
        if (isStartTag && !selfClosing) {
            parse.nodeInfo.tn = tagName;
            parse.nodeInfo.sc = false;
            parse.nodeInfo.pos = tokenPos;
        }

        if (token.type === TokenType.END_TAG && tagName === parse.nodeInfo.tn) {
            parse.nodeInfo.sc = true;
        }
    }

    // Self-closing syntax on a tag that does not support it.
    if (!supported && selfClosing && isStartTag) {
        parse.compileResult.log.push({
            line: token.location?.startLine || 1,
            column: token.location?.startCol || 1,
            reason: 'ERROR: tag `' + tagName + '` can not use selfClosing',
        });
    }
}