1 package org.jsoup.parser; 2 3 import org.jsoup.nodes.DocumentType; 4 5 /** 6 * States and transition activations for the Tokeniser. 7 */ 8 enum TokeniserState { 9 Data { 10 // in data state, gather characters until a character reference or tag is found read(Tokeniser t, CharacterReader r)11 @Override void read(Tokeniser t, CharacterReader r) { 12 switch (r.current()) { 13 case '&': 14 t.advanceTransition(CharacterReferenceInData); 15 break; 16 case '<': 17 t.advanceTransition(TagOpen); 18 break; 19 case nullChar: 20 t.error(this); // NOT replacement character (oddly?) 21 t.emit(r.consume()); 22 break; 23 case eof: 24 t.emit(new Token.EOF()); 25 break; 26 default: 27 String data = r.consumeData(); 28 t.emit(data); 29 break; 30 } 31 } 32 }, 33 CharacterReferenceInData { 34 // from & in data read(Tokeniser t, CharacterReader r)35 @Override void read(Tokeniser t, CharacterReader r) { 36 readCharRef(t, Data); 37 } 38 }, 39 Rcdata { 40 /// handles data in title, textarea etc read(Tokeniser t, CharacterReader r)41 @Override void read(Tokeniser t, CharacterReader r) { 42 switch (r.current()) { 43 case '&': 44 t.advanceTransition(CharacterReferenceInRcdata); 45 break; 46 case '<': 47 t.advanceTransition(RcdataLessthanSign); 48 break; 49 case nullChar: 50 t.error(this); 51 r.advance(); 52 t.emit(replacementChar); 53 break; 54 case eof: 55 t.emit(new Token.EOF()); 56 break; 57 default: 58 String data = r.consumeData(); 59 t.emit(data); 60 break; 61 } 62 } 63 }, 64 CharacterReferenceInRcdata { read(Tokeniser t, CharacterReader r)65 @Override void read(Tokeniser t, CharacterReader r) { 66 readCharRef(t, Rcdata); 67 } 68 }, 69 Rawtext { read(Tokeniser t, CharacterReader r)70 @Override void read(Tokeniser t, CharacterReader r) { 71 readRawData(t, r, this, RawtextLessthanSign); 72 } 73 }, 74 ScriptData { read(Tokeniser t, CharacterReader r)75 @Override void read(Tokeniser t, CharacterReader r) { 76 readRawData(t, r, this, ScriptDataLessthanSign); 77 } 78 }, 79 PLAINTEXT { read(Tokeniser t, CharacterReader r)80 @Override void read(Tokeniser t, CharacterReader r) { 81 switch (r.current()) { 82 case nullChar: 83 t.error(this); 84 r.advance(); 85 t.emit(replacementChar); 86 break; 87 case eof: 88 t.emit(new Token.EOF()); 89 break; 90 default: 91 String data = r.consumeTo(nullChar); 92 t.emit(data); 93 break; 94 } 95 } 96 }, 97 TagOpen { 98 // from < in data read(Tokeniser t, CharacterReader r)99 @Override void read(Tokeniser t, CharacterReader r) { 100 switch (r.current()) { 101 case '!': 102 t.advanceTransition(MarkupDeclarationOpen); 103 break; 104 case '/': 105 t.advanceTransition(EndTagOpen); 106 break; 107 case '?': 108 t.createBogusCommentPending(); 109 t.transition(BogusComment); 110 break; 111 default: 112 if (r.matchesAsciiAlpha()) { 113 t.createTagPending(true); 114 t.transition(TagName); 115 } else { 116 t.error(this); 117 t.emit('<'); // char that got us here 118 t.transition(Data); 119 } 120 break; 121 } 122 } 123 }, 124 EndTagOpen { read(Tokeniser t, CharacterReader r)125 @Override void read(Tokeniser t, CharacterReader r) { 126 if (r.isEmpty()) { 127 t.eofError(this); 128 t.emit("</"); 129 t.transition(Data); 130 } else if (r.matchesAsciiAlpha()) { 131 t.createTagPending(false); 132 t.transition(TagName); 133 } else if (r.matches('>')) { 134 t.error(this); 135 t.advanceTransition(Data); 136 } else { 137 t.error(this); 138 t.createBogusCommentPending(); 139 t.commentPending.append('/'); // push the / back on that got us here 140 t.transition(BogusComment); 141 } 142 } 143 }, 144 TagName { 145 // from < or </ in data, will have start or end tag pending read(Tokeniser t, CharacterReader r)146 @Override void read(Tokeniser t, CharacterReader r) { 147 // previous TagOpen state did NOT consume, will have a letter char in current 148 String tagName = r.consumeTagName(); 149 t.tagPending.appendTagName(tagName); 150 151 char c = r.consume(); 152 switch (c) { 153 case '\t': 154 case '\n': 155 case '\r': 156 case '\f': 157 case ' ': 158 t.transition(BeforeAttributeName); 159 break; 160 case '/': 161 t.transition(SelfClosingStartTag); 162 break; 163 case '<': // NOTE: out of spec, but clear author intent 164 r.unconsume(); 165 t.error(this); 166 // intended fall through to next > 167 case '>': 168 t.emitTagPending(); 169 t.transition(Data); 170 break; 171 case nullChar: // replacement 172 t.tagPending.appendTagName(replacementStr); 173 break; 174 case eof: // should emit pending tag? 175 t.eofError(this); 176 t.transition(Data); 177 break; 178 default: // buffer underrun 179 t.tagPending.appendTagName(c); 180 } 181 } 182 }, 183 RcdataLessthanSign { 184 // from < in rcdata read(Tokeniser t, CharacterReader r)185 @Override void read(Tokeniser t, CharacterReader r) { 186 if (r.matches('/')) { 187 t.createTempBuffer(); 188 t.advanceTransition(RCDATAEndTagOpen); 189 } else if (r.readFully() && r.matchesAsciiAlpha() && t.appropriateEndTagName() != null && !r.containsIgnoreCase(t.appropriateEndTagSeq())) { 190 // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than 191 // consuming to EOF; break out here 192 t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName()); 193 t.emitTagPending(); 194 t.transition(TagOpen); // straight into TagOpen, as we came from < and looks like we're on a start tag 195 } else { 196 t.emit("<"); 197 t.transition(Rcdata); 198 } 199 } 200 }, 201 RCDATAEndTagOpen { read(Tokeniser t, CharacterReader r)202 @Override void read(Tokeniser t, CharacterReader r) { 203 if (r.matchesAsciiAlpha()) { 204 t.createTagPending(false); 205 t.tagPending.appendTagName(r.current()); 206 t.dataBuffer.append(r.current()); 207 t.advanceTransition(RCDATAEndTagName); 208 } else { 209 t.emit("</"); 210 t.transition(Rcdata); 211 } 212 } 213 }, 214 RCDATAEndTagName { read(Tokeniser t, CharacterReader r)215 @Override void read(Tokeniser t, CharacterReader r) { 216 if (r.matchesAsciiAlpha()) { 217 String name = r.consumeLetterSequence(); 218 t.tagPending.appendTagName(name); 219 t.dataBuffer.append(name); 220 return; 221 } 222 223 char c = r.consume(); 224 switch (c) { 225 case '\t': 226 case '\n': 227 case '\r': 228 case '\f': 229 case ' ': 230 if (t.isAppropriateEndTagToken()) 231 t.transition(BeforeAttributeName); 232 else 233 anythingElse(t, r); 234 break; 235 case '/': 236 if (t.isAppropriateEndTagToken()) 237 t.transition(SelfClosingStartTag); 238 else 239 anythingElse(t, r); 240 break; 241 case '>': 242 if (t.isAppropriateEndTagToken()) { 243 t.emitTagPending(); 244 t.transition(Data); 245 } 246 else 247 anythingElse(t, r); 248 break; 249 default: 250 anythingElse(t, r); 251 } 252 } 253 anythingElse(Tokeniser t, CharacterReader r)254 private void anythingElse(Tokeniser t, CharacterReader r) { 255 t.emit("</"); 256 t.emit(t.dataBuffer); 257 r.unconsume(); 258 t.transition(Rcdata); 259 } 260 }, 261 RawtextLessthanSign { read(Tokeniser t, CharacterReader r)262 @Override void read(Tokeniser t, CharacterReader r) { 263 if (r.matches('/')) { 264 t.createTempBuffer(); 265 t.advanceTransition(RawtextEndTagOpen); 266 } else { 267 t.emit('<'); 268 t.transition(Rawtext); 269 } 270 } 271 }, 272 RawtextEndTagOpen { read(Tokeniser t, CharacterReader r)273 @Override void read(Tokeniser t, CharacterReader r) { 274 readEndTag(t, r, RawtextEndTagName, Rawtext); 275 } 276 }, 277 RawtextEndTagName { read(Tokeniser t, CharacterReader r)278 @Override void read(Tokeniser t, CharacterReader r) { 279 handleDataEndTag(t, r, Rawtext); 280 } 281 }, 282 ScriptDataLessthanSign { read(Tokeniser t, CharacterReader r)283 @Override void read(Tokeniser t, CharacterReader r) { 284 switch (r.consume()) { 285 case '/': 286 t.createTempBuffer(); 287 t.transition(ScriptDataEndTagOpen); 288 break; 289 case '!': 290 t.emit("<!"); 291 t.transition(ScriptDataEscapeStart); 292 break; 293 case eof: 294 t.emit("<"); 295 t.eofError(this); 296 t.transition(Data); 297 break; 298 default: 299 t.emit("<"); 300 r.unconsume(); 301 t.transition(ScriptData); 302 } 303 } 304 }, 305 ScriptDataEndTagOpen { read(Tokeniser t, CharacterReader r)306 @Override void read(Tokeniser t, CharacterReader r) { 307 readEndTag(t, r, ScriptDataEndTagName, ScriptData); 308 } 309 }, 310 ScriptDataEndTagName { read(Tokeniser t, CharacterReader r)311 @Override void read(Tokeniser t, CharacterReader r) { 312 handleDataEndTag(t, r, ScriptData); 313 } 314 }, 315 ScriptDataEscapeStart { read(Tokeniser t, CharacterReader r)316 @Override void read(Tokeniser t, CharacterReader r) { 317 if (r.matches('-')) { 318 t.emit('-'); 319 t.advanceTransition(ScriptDataEscapeStartDash); 320 } else { 321 t.transition(ScriptData); 322 } 323 } 324 }, 325 ScriptDataEscapeStartDash { read(Tokeniser t, CharacterReader r)326 @Override void read(Tokeniser t, CharacterReader r) { 327 if (r.matches('-')) { 328 t.emit('-'); 329 t.advanceTransition(ScriptDataEscapedDashDash); 330 } else { 331 t.transition(ScriptData); 332 } 333 } 334 }, 335 ScriptDataEscaped { read(Tokeniser t, CharacterReader r)336 @Override void read(Tokeniser t, CharacterReader r) { 337 if (r.isEmpty()) { 338 t.eofError(this); 339 t.transition(Data); 340 return; 341 } 342 343 switch (r.current()) { 344 case '-': 345 t.emit('-'); 346 t.advanceTransition(ScriptDataEscapedDash); 347 break; 348 case '<': 349 t.advanceTransition(ScriptDataEscapedLessthanSign); 350 break; 351 case nullChar: 352 t.error(this); 353 r.advance(); 354 t.emit(replacementChar); 355 break; 356 default: 357 String data = r.consumeToAny('-', '<', nullChar); 358 t.emit(data); 359 } 360 } 361 }, 362 ScriptDataEscapedDash { read(Tokeniser t, CharacterReader r)363 @Override void read(Tokeniser t, CharacterReader r) { 364 if (r.isEmpty()) { 365 t.eofError(this); 366 t.transition(Data); 367 return; 368 } 369 370 char c = r.consume(); 371 switch (c) { 372 case '-': 373 t.emit(c); 374 t.transition(ScriptDataEscapedDashDash); 375 break; 376 case '<': 377 t.transition(ScriptDataEscapedLessthanSign); 378 break; 379 case nullChar: 380 t.error(this); 381 t.emit(replacementChar); 382 t.transition(ScriptDataEscaped); 383 break; 384 default: 385 t.emit(c); 386 t.transition(ScriptDataEscaped); 387 } 388 } 389 }, 390 ScriptDataEscapedDashDash { read(Tokeniser t, CharacterReader r)391 @Override void read(Tokeniser t, CharacterReader r) { 392 if (r.isEmpty()) { 393 t.eofError(this); 394 t.transition(Data); 395 return; 396 } 397 398 char c = r.consume(); 399 switch (c) { 400 case '-': 401 t.emit(c); 402 break; 403 case '<': 404 t.transition(ScriptDataEscapedLessthanSign); 405 break; 406 case '>': 407 t.emit(c); 408 t.transition(ScriptData); 409 break; 410 case nullChar: 411 t.error(this); 412 t.emit(replacementChar); 413 t.transition(ScriptDataEscaped); 414 break; 415 default: 416 t.emit(c); 417 t.transition(ScriptDataEscaped); 418 } 419 } 420 }, 421 ScriptDataEscapedLessthanSign { read(Tokeniser t, CharacterReader r)422 @Override void read(Tokeniser t, CharacterReader r) { 423 if (r.matchesAsciiAlpha()) { 424 t.createTempBuffer(); 425 t.dataBuffer.append(r.current()); 426 t.emit("<"); 427 t.emit(r.current()); 428 t.advanceTransition(ScriptDataDoubleEscapeStart); 429 } else if (r.matches('/')) { 430 t.createTempBuffer(); 431 t.advanceTransition(ScriptDataEscapedEndTagOpen); 432 } else { 433 t.emit('<'); 434 t.transition(ScriptDataEscaped); 435 } 436 } 437 }, 438 ScriptDataEscapedEndTagOpen { read(Tokeniser t, CharacterReader r)439 @Override void read(Tokeniser t, CharacterReader r) { 440 if (r.matchesAsciiAlpha()) { 441 t.createTagPending(false); 442 t.tagPending.appendTagName(r.current()); 443 t.dataBuffer.append(r.current()); 444 t.advanceTransition(ScriptDataEscapedEndTagName); 445 } else { 446 t.emit("</"); 447 t.transition(ScriptDataEscaped); 448 } 449 } 450 }, 451 ScriptDataEscapedEndTagName { read(Tokeniser t, CharacterReader r)452 @Override void read(Tokeniser t, CharacterReader r) { 453 handleDataEndTag(t, r, ScriptDataEscaped); 454 } 455 }, 456 ScriptDataDoubleEscapeStart { read(Tokeniser t, CharacterReader r)457 @Override void read(Tokeniser t, CharacterReader r) { 458 handleDataDoubleEscapeTag(t, r, ScriptDataDoubleEscaped, ScriptDataEscaped); 459 } 460 }, 461 ScriptDataDoubleEscaped { read(Tokeniser t, CharacterReader r)462 @Override void read(Tokeniser t, CharacterReader r) { 463 char c = r.current(); 464 switch (c) { 465 case '-': 466 t.emit(c); 467 t.advanceTransition(ScriptDataDoubleEscapedDash); 468 break; 469 case '<': 470 t.emit(c); 471 t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); 472 break; 473 case nullChar: 474 t.error(this); 475 r.advance(); 476 t.emit(replacementChar); 477 break; 478 case eof: 479 t.eofError(this); 480 t.transition(Data); 481 break; 482 default: 483 String data = r.consumeToAny('-', '<', nullChar); 484 t.emit(data); 485 } 486 } 487 }, 488 ScriptDataDoubleEscapedDash { read(Tokeniser t, CharacterReader r)489 @Override void read(Tokeniser t, CharacterReader r) { 490 char c = r.consume(); 491 switch (c) { 492 case '-': 493 t.emit(c); 494 t.transition(ScriptDataDoubleEscapedDashDash); 495 break; 496 case '<': 497 t.emit(c); 498 t.transition(ScriptDataDoubleEscapedLessthanSign); 499 break; 500 case nullChar: 501 t.error(this); 502 t.emit(replacementChar); 503 t.transition(ScriptDataDoubleEscaped); 504 break; 505 case eof: 506 t.eofError(this); 507 t.transition(Data); 508 break; 509 default: 510 t.emit(c); 511 t.transition(ScriptDataDoubleEscaped); 512 } 513 } 514 }, 515 ScriptDataDoubleEscapedDashDash { read(Tokeniser t, CharacterReader r)516 @Override void read(Tokeniser t, CharacterReader r) { 517 char c = r.consume(); 518 switch (c) { 519 case '-': 520 t.emit(c); 521 break; 522 case '<': 523 t.emit(c); 524 t.transition(ScriptDataDoubleEscapedLessthanSign); 525 break; 526 case '>': 527 t.emit(c); 528 t.transition(ScriptData); 529 break; 530 case nullChar: 531 t.error(this); 532 t.emit(replacementChar); 533 t.transition(ScriptDataDoubleEscaped); 534 break; 535 case eof: 536 t.eofError(this); 537 t.transition(Data); 538 break; 539 default: 540 t.emit(c); 541 t.transition(ScriptDataDoubleEscaped); 542 } 543 } 544 }, 545 ScriptDataDoubleEscapedLessthanSign { read(Tokeniser t, CharacterReader r)546 @Override void read(Tokeniser t, CharacterReader r) { 547 if (r.matches('/')) { 548 t.emit('/'); 549 t.createTempBuffer(); 550 t.advanceTransition(ScriptDataDoubleEscapeEnd); 551 } else { 552 t.transition(ScriptDataDoubleEscaped); 553 } 554 } 555 }, 556 ScriptDataDoubleEscapeEnd { read(Tokeniser t, CharacterReader r)557 @Override void read(Tokeniser t, CharacterReader r) { 558 handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped); 559 } 560 }, 561 BeforeAttributeName { 562 // from tagname <xxx read(Tokeniser t, CharacterReader r)563 @Override void read(Tokeniser t, CharacterReader r) { 564 char c = r.consume(); 565 switch (c) { 566 case '\t': 567 case '\n': 568 case '\r': 569 case '\f': 570 case ' ': 571 break; // ignore whitespace 572 case '/': 573 t.transition(SelfClosingStartTag); 574 break; 575 case '<': // NOTE: out of spec, but clear (spec has this as a part of the attribute name) 576 r.unconsume(); 577 t.error(this); 578 // intended fall through as if > 579 case '>': 580 t.emitTagPending(); 581 t.transition(Data); 582 break; 583 case nullChar: 584 r.unconsume(); 585 t.error(this); 586 t.tagPending.newAttribute(); 587 t.transition(AttributeName); 588 break; 589 case eof: 590 t.eofError(this); 591 t.transition(Data); 592 break; 593 case '"': 594 case '\'': 595 case '=': 596 t.error(this); 597 t.tagPending.newAttribute(); 598 t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); 599 t.transition(AttributeName); 600 break; 601 default: // A-Z, anything else 602 t.tagPending.newAttribute(); 603 r.unconsume(); 604 t.transition(AttributeName); 605 } 606 } 607 }, 608 AttributeName { 609 // from before attribute name read(Tokeniser t, CharacterReader r)610 @Override void read(Tokeniser t, CharacterReader r) { 611 int pos = r.pos(); 612 String name = r.consumeToAnySorted(attributeNameCharsSorted); // spec deviate - consume and emit nulls in one hit vs stepping 613 t.tagPending.appendAttributeName(name, pos, r.pos()); 614 615 pos = r.pos(); 616 char c = r.consume(); 617 switch (c) { 618 case '\t': 619 case '\n': 620 case '\r': 621 case '\f': 622 case ' ': 623 t.transition(AfterAttributeName); 624 break; 625 case '/': 626 t.transition(SelfClosingStartTag); 627 break; 628 case '=': 629 t.transition(BeforeAttributeValue); 630 break; 631 case '>': 632 t.emitTagPending(); 633 t.transition(Data); 634 break; 635 case eof: 636 t.eofError(this); 637 t.transition(Data); 638 break; 639 case '"': 640 case '\'': 641 case '<': 642 t.error(this); 643 t.tagPending.appendAttributeName(c, pos, r.pos()); 644 break; 645 default: // buffer underrun 646 t.tagPending.appendAttributeName(c, pos, r.pos()); 647 } 648 } 649 }, 650 AfterAttributeName { read(Tokeniser t, CharacterReader r)651 @Override void read(Tokeniser t, CharacterReader r) { 652 char c = r.consume(); 653 switch (c) { 654 case '\t': 655 case '\n': 656 case '\r': 657 case '\f': 658 case ' ': 659 // ignore 660 break; 661 case '/': 662 t.transition(SelfClosingStartTag); 663 break; 664 case '=': 665 t.transition(BeforeAttributeValue); 666 break; 667 case '>': 668 t.emitTagPending(); 669 t.transition(Data); 670 break; 671 case nullChar: 672 t.error(this); 673 t.tagPending.appendAttributeName(replacementChar, r.pos()-1, r.pos()); 674 t.transition(AttributeName); 675 break; 676 case eof: 677 t.eofError(this); 678 t.transition(Data); 679 break; 680 case '"': 681 case '\'': 682 case '<': 683 t.error(this); 684 t.tagPending.newAttribute(); 685 t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); 686 t.transition(AttributeName); 687 break; 688 default: // A-Z, anything else 689 t.tagPending.newAttribute(); 690 r.unconsume(); 691 t.transition(AttributeName); 692 } 693 } 694 }, 695 BeforeAttributeValue { read(Tokeniser t, CharacterReader r)696 @Override void read(Tokeniser t, CharacterReader r) { 697 char c = r.consume(); 698 switch (c) { 699 case '\t': 700 case '\n': 701 case '\r': 702 case '\f': 703 case ' ': 704 // ignore 705 break; 706 case '"': 707 t.transition(AttributeValue_doubleQuoted); 708 break; 709 case '&': 710 r.unconsume(); 711 t.transition(AttributeValue_unquoted); 712 break; 713 case '\'': 714 t.transition(AttributeValue_singleQuoted); 715 break; 716 case nullChar: 717 t.error(this); 718 t.tagPending.appendAttributeValue(replacementChar, r.pos()-1, r.pos()); 719 t.transition(AttributeValue_unquoted); 720 break; 721 case eof: 722 t.eofError(this); 723 t.emitTagPending(); 724 t.transition(Data); 725 break; 726 case '>': 727 t.error(this); 728 t.emitTagPending(); 729 t.transition(Data); 730 break; 731 case '<': 732 case '=': 733 case '`': 734 t.error(this); 735 t.tagPending.appendAttributeValue(c, r.pos()-1, r.pos()); 736 t.transition(AttributeValue_unquoted); 737 break; 738 default: 739 r.unconsume(); 740 t.transition(AttributeValue_unquoted); 741 } 742 } 743 }, 744 AttributeValue_doubleQuoted { read(Tokeniser t, CharacterReader r)745 @Override void read(Tokeniser t, CharacterReader r) { 746 int pos = r.pos(); 747 String value = r.consumeAttributeQuoted(false); 748 if (value.length() > 0) 749 t.tagPending.appendAttributeValue(value, pos, r.pos()); 750 else 751 t.tagPending.setEmptyAttributeValue(); 752 753 pos = r.pos(); 754 char c = r.consume(); 755 switch (c) { 756 case '"': 757 t.transition(AfterAttributeValue_quoted); 758 break; 759 case '&': 760 int[] ref = t.consumeCharacterReference('"', true); 761 if (ref != null) 762 t.tagPending.appendAttributeValue(ref, pos, r.pos()); 763 else 764 t.tagPending.appendAttributeValue('&', pos, r.pos()); 765 break; 766 case nullChar: 767 t.error(this); 768 t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); 769 break; 770 case eof: 771 t.eofError(this); 772 t.transition(Data); 773 break; 774 default: // hit end of buffer in first read, still in attribute 775 t.tagPending.appendAttributeValue(c, pos, r.pos()); 776 } 777 } 778 }, 779 AttributeValue_singleQuoted { read(Tokeniser t, CharacterReader r)780 @Override void read(Tokeniser t, CharacterReader r) { 781 int pos = r.pos(); 782 String value = r.consumeAttributeQuoted(true); 783 if (value.length() > 0) 784 t.tagPending.appendAttributeValue(value, pos, r.pos()); 785 else 786 t.tagPending.setEmptyAttributeValue(); 787 788 pos = r.pos(); 789 char c = r.consume(); 790 switch (c) { 791 case '\'': 792 t.transition(AfterAttributeValue_quoted); 793 break; 794 case '&': 795 int[] ref = t.consumeCharacterReference('\'', true); 796 if (ref != null) 797 t.tagPending.appendAttributeValue(ref, pos, r.pos()); 798 else 799 t.tagPending.appendAttributeValue('&', pos, r.pos()); 800 break; 801 case nullChar: 802 t.error(this); 803 t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); 804 break; 805 case eof: 806 t.eofError(this); 807 t.transition(Data); 808 break; 809 default: // hit end of buffer in first read, still in attribute 810 t.tagPending.appendAttributeValue(c, pos, r.pos()); 811 } 812 } 813 }, 814 AttributeValue_unquoted { read(Tokeniser t, CharacterReader r)815 @Override void read(Tokeniser t, CharacterReader r) { 816 int pos = r.pos(); 817 String value = r.consumeToAnySorted(attributeValueUnquoted); 818 if (value.length() > 0) 819 t.tagPending.appendAttributeValue(value, pos, r.pos()); 820 821 pos = r.pos(); 822 char c = r.consume(); 823 switch (c) { 824 case '\t': 825 case '\n': 826 case '\r': 827 case '\f': 828 case ' ': 829 t.transition(BeforeAttributeName); 830 break; 831 case '&': 832 int[] ref = t.consumeCharacterReference('>', true); 833 if (ref != null) 834 t.tagPending.appendAttributeValue(ref, pos, r.pos()); 835 else 836 t.tagPending.appendAttributeValue('&', pos, r.pos()); 837 break; 838 case '>': 839 t.emitTagPending(); 840 t.transition(Data); 841 break; 842 case nullChar: 843 t.error(this); 844 t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); 845 break; 846 case eof: 847 t.eofError(this); 848 t.transition(Data); 849 break; 850 case '"': 851 case '\'': 852 case '<': 853 case '=': 854 case '`': 855 t.error(this); 856 t.tagPending.appendAttributeValue(c, pos, r.pos()); 857 break; 858 default: // hit end of buffer in first read, still in attribute 859 t.tagPending.appendAttributeValue(c, pos, r.pos()); 860 } 861 862 } 863 }, 864 // CharacterReferenceInAttributeValue state handled inline 865 AfterAttributeValue_quoted { read(Tokeniser t, CharacterReader r)866 @Override void read(Tokeniser t, CharacterReader r) { 867 char c = r.consume(); 868 switch (c) { 869 case '\t': 870 case '\n': 871 case '\r': 872 case '\f': 873 case ' ': 874 t.transition(BeforeAttributeName); 875 break; 876 case '/': 877 t.transition(SelfClosingStartTag); 878 break; 879 case '>': 880 t.emitTagPending(); 881 t.transition(Data); 882 break; 883 case eof: 884 t.eofError(this); 885 t.transition(Data); 886 break; 887 default: 888 r.unconsume(); 889 t.error(this); 890 t.transition(BeforeAttributeName); 891 } 892 893 } 894 }, 895 SelfClosingStartTag { read(Tokeniser t, CharacterReader r)896 @Override void read(Tokeniser t, CharacterReader r) { 897 char c = r.consume(); 898 switch (c) { 899 case '>': 900 t.tagPending.selfClosing = true; 901 t.emitTagPending(); 902 t.transition(Data); 903 break; 904 case eof: 905 t.eofError(this); 906 t.transition(Data); 907 break; 908 default: 909 r.unconsume(); 910 t.error(this); 911 t.transition(BeforeAttributeName); 912 } 913 } 914 }, 915 BogusComment { read(Tokeniser t, CharacterReader r)916 @Override void read(Tokeniser t, CharacterReader r) { 917 // todo: handle bogus comment starting from eof. when does that trigger? 918 t.commentPending.append(r.consumeTo('>')); 919 // todo: replace nullChar with replaceChar 920 char next = r.current(); 921 if (next == '>' || next == eof) { 922 r.consume(); 923 t.emitCommentPending(); 924 t.transition(Data); 925 } 926 } 927 }, 928 MarkupDeclarationOpen { read(Tokeniser t, CharacterReader r)929 @Override void read(Tokeniser t, CharacterReader r) { 930 if (r.matchConsume("--")) { 931 t.createCommentPending(); 932 t.transition(CommentStart); 933 } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { 934 t.transition(Doctype); 935 } else if (r.matchConsume("[CDATA[")) { 936 // todo: should actually check current namespace, and only non-html allows cdata. until namespace 937 // is implemented properly, keep handling as cdata 938 //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { 939 t.createTempBuffer(); 940 t.transition(CdataSection); 941 } else { 942 t.error(this); 943 t.createBogusCommentPending(); 944 t.transition(BogusComment); 945 } 946 } 947 }, 948 CommentStart { read(Tokeniser t, CharacterReader r)949 @Override void read(Tokeniser t, CharacterReader r) { 950 char c = r.consume(); 951 switch (c) { 952 case '-': 953 t.transition(CommentStartDash); 954 break; 955 case nullChar: 956 t.error(this); 957 t.commentPending.append(replacementChar); 958 t.transition(Comment); 959 break; 960 case '>': 961 t.error(this); 962 t.emitCommentPending(); 963 t.transition(Data); 964 break; 965 case eof: 966 t.eofError(this); 967 t.emitCommentPending(); 968 t.transition(Data); 969 break; 970 default: 971 r.unconsume(); 972 t.transition(Comment); 973 } 974 } 975 }, 976 CommentStartDash { read(Tokeniser t, CharacterReader r)977 @Override void read(Tokeniser t, CharacterReader r) { 978 char c = r.consume(); 979 switch (c) { 980 case '-': 981 t.transition(CommentEnd); 982 break; 983 case nullChar: 984 t.error(this); 985 t.commentPending.append(replacementChar); 986 t.transition(Comment); 987 break; 988 case '>': 989 t.error(this); 990 t.emitCommentPending(); 991 t.transition(Data); 992 break; 993 case eof: 994 t.eofError(this); 995 t.emitCommentPending(); 996 t.transition(Data); 997 break; 998 default: 999 t.commentPending.append(c); 1000 t.transition(Comment); 1001 } 1002 } 1003 }, 1004 Comment { read(Tokeniser t, CharacterReader r)1005 @Override void read(Tokeniser t, CharacterReader r) { 1006 char c = r.current(); 1007 switch (c) { 1008 case '-': 1009 t.advanceTransition(CommentEndDash); 1010 break; 1011 case nullChar: 1012 t.error(this); 1013 r.advance(); 1014 t.commentPending.append(replacementChar); 1015 break; 1016 case eof: 1017 t.eofError(this); 1018 t.emitCommentPending(); 1019 t.transition(Data); 1020 break; 1021 default: 1022 t.commentPending.append(r.consumeToAny('-', nullChar)); 1023 } 1024 } 1025 }, 1026 CommentEndDash { read(Tokeniser t, CharacterReader r)1027 @Override void read(Tokeniser t, CharacterReader r) { 1028 char c = r.consume(); 1029 switch (c) { 1030 case '-': 1031 t.transition(CommentEnd); 1032 break; 1033 case nullChar: 1034 t.error(this); 1035 t.commentPending.append('-').append(replacementChar); 1036 t.transition(Comment); 1037 break; 1038 case eof: 1039 t.eofError(this); 1040 t.emitCommentPending(); 1041 t.transition(Data); 1042 break; 1043 default: 1044 t.commentPending.append('-').append(c); 1045 t.transition(Comment); 1046 } 1047 } 1048 }, 1049 CommentEnd { read(Tokeniser t, CharacterReader r)1050 @Override void read(Tokeniser t, CharacterReader r) { 1051 char c = r.consume(); 1052 switch (c) { 1053 case '>': 1054 t.emitCommentPending(); 1055 t.transition(Data); 1056 break; 1057 case nullChar: 1058 t.error(this); 1059 t.commentPending.append("--").append(replacementChar); 1060 t.transition(Comment); 1061 break; 1062 case '!': 1063 t.transition(CommentEndBang); 1064 break; 1065 case '-': 1066 t.commentPending.append('-'); 1067 break; 1068 case eof: 1069 t.eofError(this); 1070 t.emitCommentPending(); 1071 t.transition(Data); 1072 break; 1073 default: 1074 t.commentPending.append("--").append(c); 1075 t.transition(Comment); 1076 } 1077 } 1078 }, 1079 CommentEndBang { read(Tokeniser t, CharacterReader r)1080 @Override void read(Tokeniser t, CharacterReader r) { 1081 char c = r.consume(); 1082 switch (c) { 1083 case '-': 1084 t.commentPending.append("--!"); 1085 t.transition(CommentEndDash); 1086 break; 1087 case '>': 1088 t.emitCommentPending(); 1089 t.transition(Data); 1090 break; 1091 case nullChar: 1092 t.error(this); 1093 t.commentPending.append("--!").append(replacementChar); 1094 t.transition(Comment); 1095 break; 1096 case eof: 1097 t.eofError(this); 1098 t.emitCommentPending(); 1099 t.transition(Data); 1100 break; 1101 default: 1102 t.commentPending.append("--!").append(c); 1103 t.transition(Comment); 1104 } 1105 } 1106 }, 1107 Doctype { read(Tokeniser t, CharacterReader r)1108 @Override void read(Tokeniser t, CharacterReader r) { 1109 char c = r.consume(); 1110 switch (c) { 1111 case '\t': 1112 case '\n': 1113 case '\r': 1114 case '\f': 1115 case ' ': 1116 t.transition(BeforeDoctypeName); 1117 break; 1118 case eof: 1119 t.eofError(this); 1120 // note: fall through to > case 1121 case '>': // catch invalid <!DOCTYPE> 1122 t.error(this); 1123 t.createDoctypePending(); 1124 t.doctypePending.forceQuirks = true; 1125 t.emitDoctypePending(); 1126 t.transition(Data); 1127 break; 1128 default: 1129 t.error(this); 1130 t.transition(BeforeDoctypeName); 1131 } 1132 } 1133 }, 1134 BeforeDoctypeName { read(Tokeniser t, CharacterReader r)1135 @Override void read(Tokeniser t, CharacterReader r) { 1136 if (r.matchesAsciiAlpha()) { 1137 t.createDoctypePending(); 1138 t.transition(DoctypeName); 1139 return; 1140 } 1141 char c = r.consume(); 1142 switch (c) { 1143 case '\t': 1144 case '\n': 1145 case '\r': 1146 case '\f': 1147 case ' ': 1148 break; // ignore whitespace 1149 case nullChar: 1150 t.error(this); 1151 t.createDoctypePending(); 1152 t.doctypePending.name.append(replacementChar); 1153 t.transition(DoctypeName); 1154 break; 1155 case eof: 1156 t.eofError(this); 1157 t.createDoctypePending(); 1158 t.doctypePending.forceQuirks = true; 1159 t.emitDoctypePending(); 1160 t.transition(Data); 1161 break; 1162 default: 1163 t.createDoctypePending(); 1164 t.doctypePending.name.append(c); 1165 t.transition(DoctypeName); 1166 } 1167 } 1168 }, 1169 DoctypeName { read(Tokeniser t, CharacterReader r)1170 @Override void read(Tokeniser t, CharacterReader r) { 1171 if (r.matchesLetter()) { 1172 String name = r.consumeLetterSequence(); 1173 t.doctypePending.name.append(name); 1174 return; 1175 } 1176 char c = r.consume(); 1177 switch (c) { 1178 case '>': 1179 t.emitDoctypePending(); 1180 t.transition(Data); 1181 break; 1182 case '\t': 1183 case '\n': 1184 case '\r': 1185 case '\f': 1186 case ' ': 1187 t.transition(AfterDoctypeName); 1188 break; 1189 case nullChar: 1190 t.error(this); 1191 t.doctypePending.name.append(replacementChar); 1192 break; 1193 case eof: 1194 t.eofError(this); 1195 t.doctypePending.forceQuirks = true; 1196 t.emitDoctypePending(); 1197 t.transition(Data); 1198 break; 1199 default: 1200 t.doctypePending.name.append(c); 1201 } 1202 } 1203 }, 1204 AfterDoctypeName { read(Tokeniser t, CharacterReader r)1205 @Override void read(Tokeniser t, CharacterReader r) { 1206 if (r.isEmpty()) { 1207 t.eofError(this); 1208 t.doctypePending.forceQuirks = true; 1209 t.emitDoctypePending(); 1210 t.transition(Data); 1211 return; 1212 } 1213 if (r.matchesAny('\t', '\n', '\r', '\f', ' ')) 1214 r.advance(); // ignore whitespace 1215 else if (r.matches('>')) { 1216 t.emitDoctypePending(); 1217 t.advanceTransition(Data); 1218 } else if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) { 1219 t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY; 1220 t.transition(AfterDoctypePublicKeyword); 1221 } else if (r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) { 1222 t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY; 1223 t.transition(AfterDoctypeSystemKeyword); 1224 } else { 1225 t.error(this); 1226 t.doctypePending.forceQuirks = true; 1227 t.advanceTransition(BogusDoctype); 1228 } 1229 1230 } 1231 }, 1232 AfterDoctypePublicKeyword { read(Tokeniser t, CharacterReader r)1233 @Override void read(Tokeniser t, CharacterReader r) { 1234 char c = r.consume(); 1235 switch (c) { 1236 case '\t': 1237 case '\n': 1238 case '\r': 1239 case '\f': 1240 case ' ': 1241 t.transition(BeforeDoctypePublicIdentifier); 1242 break; 1243 case '"': 1244 t.error(this); 1245 // set public id to empty string 1246 t.transition(DoctypePublicIdentifier_doubleQuoted); 1247 break; 1248 case '\'': 1249 t.error(this); 1250 // set public id to empty string 1251 t.transition(DoctypePublicIdentifier_singleQuoted); 1252 break; 1253 case '>': 1254 t.error(this); 1255 t.doctypePending.forceQuirks = true; 1256 t.emitDoctypePending(); 1257 t.transition(Data); 1258 break; 1259 case eof: 1260 t.eofError(this); 1261 t.doctypePending.forceQuirks = true; 1262 t.emitDoctypePending(); 1263 t.transition(Data); 1264 break; 1265 default: 1266 t.error(this); 1267 t.doctypePending.forceQuirks = true; 1268 t.transition(BogusDoctype); 1269 } 1270 } 1271 }, 1272 BeforeDoctypePublicIdentifier { read(Tokeniser t, CharacterReader r)1273 @Override void read(Tokeniser t, CharacterReader r) { 1274 char c = r.consume(); 1275 switch (c) { 1276 case '\t': 1277 case '\n': 1278 case '\r': 1279 case '\f': 1280 case ' ': 1281 break; 1282 case '"': 1283 // set public id to empty string 1284 t.transition(DoctypePublicIdentifier_doubleQuoted); 1285 break; 1286 case '\'': 1287 // set public id to empty string 1288 t.transition(DoctypePublicIdentifier_singleQuoted); 1289 break; 1290 case '>': 1291 t.error(this); 1292 t.doctypePending.forceQuirks = true; 1293 t.emitDoctypePending(); 1294 t.transition(Data); 1295 break; 1296 case eof: 1297 t.eofError(this); 1298 t.doctypePending.forceQuirks = true; 1299 t.emitDoctypePending(); 1300 t.transition(Data); 1301 break; 1302 default: 1303 t.error(this); 1304 t.doctypePending.forceQuirks = true; 1305 t.transition(BogusDoctype); 1306 } 1307 } 1308 }, 1309 DoctypePublicIdentifier_doubleQuoted { read(Tokeniser t, CharacterReader r)1310 @Override void read(Tokeniser t, CharacterReader r) { 1311 char c = r.consume(); 1312 switch (c) { 1313 case '"': 1314 t.transition(AfterDoctypePublicIdentifier); 1315 break; 1316 case nullChar: 1317 t.error(this); 1318 t.doctypePending.publicIdentifier.append(replacementChar); 1319 break; 1320 case '>': 1321 t.error(this); 1322 t.doctypePending.forceQuirks = true; 1323 t.emitDoctypePending(); 1324 t.transition(Data); 1325 break; 1326 case eof: 1327 t.eofError(this); 1328 t.doctypePending.forceQuirks = true; 1329 t.emitDoctypePending(); 1330 t.transition(Data); 1331 break; 1332 default: 1333 t.doctypePending.publicIdentifier.append(c); 1334 } 1335 } 1336 }, 1337 DoctypePublicIdentifier_singleQuoted { read(Tokeniser t, CharacterReader r)1338 @Override void read(Tokeniser t, CharacterReader r) { 1339 char c = r.consume(); 1340 switch (c) { 1341 case '\'': 1342 t.transition(AfterDoctypePublicIdentifier); 1343 break; 1344 case nullChar: 1345 t.error(this); 1346 t.doctypePending.publicIdentifier.append(replacementChar); 1347 break; 1348 case '>': 1349 t.error(this); 1350 t.doctypePending.forceQuirks = true; 1351 t.emitDoctypePending(); 1352 t.transition(Data); 1353 break; 1354 case eof: 1355 t.eofError(this); 1356 t.doctypePending.forceQuirks = true; 1357 t.emitDoctypePending(); 1358 t.transition(Data); 1359 break; 1360 default: 1361 t.doctypePending.publicIdentifier.append(c); 1362 } 1363 } 1364 }, 1365 AfterDoctypePublicIdentifier { read(Tokeniser t, CharacterReader r)1366 @Override void read(Tokeniser t, CharacterReader r) { 1367 char c = r.consume(); 1368 switch (c) { 1369 case '\t': 1370 case '\n': 1371 case '\r': 1372 case '\f': 1373 case ' ': 1374 t.transition(BetweenDoctypePublicAndSystemIdentifiers); 1375 break; 1376 case '>': 1377 t.emitDoctypePending(); 1378 t.transition(Data); 1379 break; 1380 case '"': 1381 t.error(this); 1382 // system id empty 1383 t.transition(DoctypeSystemIdentifier_doubleQuoted); 1384 break; 1385 case '\'': 1386 t.error(this); 1387 // system id empty 1388 t.transition(DoctypeSystemIdentifier_singleQuoted); 1389 break; 1390 case eof: 1391 t.eofError(this); 1392 t.doctypePending.forceQuirks = true; 1393 t.emitDoctypePending(); 1394 t.transition(Data); 1395 break; 1396 default: 1397 t.error(this); 1398 t.doctypePending.forceQuirks = true; 1399 t.transition(BogusDoctype); 1400 } 1401 } 1402 }, 1403 BetweenDoctypePublicAndSystemIdentifiers { read(Tokeniser t, CharacterReader r)1404 @Override void read(Tokeniser t, CharacterReader r) { 1405 char c = r.consume(); 1406 switch (c) { 1407 case '\t': 1408 case '\n': 1409 case '\r': 1410 case '\f': 1411 case ' ': 1412 break; 1413 case '>': 1414 t.emitDoctypePending(); 1415 t.transition(Data); 1416 break; 1417 case '"': 1418 t.error(this); 1419 // system id empty 1420 t.transition(DoctypeSystemIdentifier_doubleQuoted); 1421 break; 1422 case '\'': 1423 t.error(this); 1424 // system id empty 1425 t.transition(DoctypeSystemIdentifier_singleQuoted); 1426 break; 1427 case eof: 1428 t.eofError(this); 1429 t.doctypePending.forceQuirks = true; 1430 t.emitDoctypePending(); 1431 t.transition(Data); 1432 break; 1433 default: 1434 t.error(this); 1435 t.doctypePending.forceQuirks = true; 1436 t.transition(BogusDoctype); 1437 } 1438 } 1439 }, 1440 AfterDoctypeSystemKeyword { read(Tokeniser t, CharacterReader r)1441 @Override void read(Tokeniser t, CharacterReader r) { 1442 char c = r.consume(); 1443 switch (c) { 1444 case '\t': 1445 case '\n': 1446 case '\r': 1447 case '\f': 1448 case ' ': 1449 t.transition(BeforeDoctypeSystemIdentifier); 1450 break; 1451 case '>': 1452 t.error(this); 1453 t.doctypePending.forceQuirks = true; 1454 t.emitDoctypePending(); 1455 t.transition(Data); 1456 break; 1457 case '"': 1458 t.error(this); 1459 // system id empty 1460 t.transition(DoctypeSystemIdentifier_doubleQuoted); 1461 break; 1462 case '\'': 1463 t.error(this); 1464 // system id empty 1465 t.transition(DoctypeSystemIdentifier_singleQuoted); 1466 break; 1467 case eof: 1468 t.eofError(this); 1469 t.doctypePending.forceQuirks = true; 1470 t.emitDoctypePending(); 1471 t.transition(Data); 1472 break; 1473 default: 1474 t.error(this); 1475 t.doctypePending.forceQuirks = true; 1476 t.emitDoctypePending(); 1477 } 1478 } 1479 }, 1480 BeforeDoctypeSystemIdentifier { read(Tokeniser t, CharacterReader r)1481 @Override void read(Tokeniser t, CharacterReader r) { 1482 char c = r.consume(); 1483 switch (c) { 1484 case '\t': 1485 case '\n': 1486 case '\r': 1487 case '\f': 1488 case ' ': 1489 break; 1490 case '"': 1491 // set system id to empty string 1492 t.transition(DoctypeSystemIdentifier_doubleQuoted); 1493 break; 1494 case '\'': 1495 // set public id to empty string 1496 t.transition(DoctypeSystemIdentifier_singleQuoted); 1497 break; 1498 case '>': 1499 t.error(this); 1500 t.doctypePending.forceQuirks = true; 1501 t.emitDoctypePending(); 1502 t.transition(Data); 1503 break; 1504 case eof: 1505 t.eofError(this); 1506 t.doctypePending.forceQuirks = true; 1507 t.emitDoctypePending(); 1508 t.transition(Data); 1509 break; 1510 default: 1511 t.error(this); 1512 t.doctypePending.forceQuirks = true; 1513 t.transition(BogusDoctype); 1514 } 1515 } 1516 }, 1517 DoctypeSystemIdentifier_doubleQuoted { read(Tokeniser t, CharacterReader r)1518 @Override void read(Tokeniser t, CharacterReader r) { 1519 char c = r.consume(); 1520 switch (c) { 1521 case '"': 1522 t.transition(AfterDoctypeSystemIdentifier); 1523 break; 1524 case nullChar: 1525 t.error(this); 1526 t.doctypePending.systemIdentifier.append(replacementChar); 1527 break; 1528 case '>': 1529 t.error(this); 1530 t.doctypePending.forceQuirks = true; 1531 t.emitDoctypePending(); 1532 t.transition(Data); 1533 break; 1534 case eof: 1535 t.eofError(this); 1536 t.doctypePending.forceQuirks = true; 1537 t.emitDoctypePending(); 1538 t.transition(Data); 1539 break; 1540 default: 1541 t.doctypePending.systemIdentifier.append(c); 1542 } 1543 } 1544 }, 1545 DoctypeSystemIdentifier_singleQuoted { read(Tokeniser t, CharacterReader r)1546 @Override void read(Tokeniser t, CharacterReader r) { 1547 char c = r.consume(); 1548 switch (c) { 1549 case '\'': 1550 t.transition(AfterDoctypeSystemIdentifier); 1551 break; 1552 case nullChar: 1553 t.error(this); 1554 t.doctypePending.systemIdentifier.append(replacementChar); 1555 break; 1556 case '>': 1557 t.error(this); 1558 t.doctypePending.forceQuirks = true; 1559 t.emitDoctypePending(); 1560 t.transition(Data); 1561 break; 1562 case eof: 1563 t.eofError(this); 1564 t.doctypePending.forceQuirks = true; 1565 t.emitDoctypePending(); 1566 t.transition(Data); 1567 break; 1568 default: 1569 t.doctypePending.systemIdentifier.append(c); 1570 } 1571 } 1572 }, 1573 AfterDoctypeSystemIdentifier { read(Tokeniser t, CharacterReader r)1574 @Override void read(Tokeniser t, CharacterReader r) { 1575 char c = r.consume(); 1576 switch (c) { 1577 case '\t': 1578 case '\n': 1579 case '\r': 1580 case '\f': 1581 case ' ': 1582 break; 1583 case '>': 1584 t.emitDoctypePending(); 1585 t.transition(Data); 1586 break; 1587 case eof: 1588 t.eofError(this); 1589 t.doctypePending.forceQuirks = true; 1590 t.emitDoctypePending(); 1591 t.transition(Data); 1592 break; 1593 default: 1594 t.error(this); 1595 t.transition(BogusDoctype); 1596 // NOT force quirks 1597 } 1598 } 1599 }, 1600 BogusDoctype { read(Tokeniser t, CharacterReader r)1601 @Override void read(Tokeniser t, CharacterReader r) { 1602 char c = r.consume(); 1603 switch (c) { 1604 case '>': 1605 t.emitDoctypePending(); 1606 t.transition(Data); 1607 break; 1608 case eof: 1609 t.emitDoctypePending(); 1610 t.transition(Data); 1611 break; 1612 default: 1613 // ignore char 1614 break; 1615 } 1616 } 1617 }, 1618 CdataSection { read(Tokeniser t, CharacterReader r)1619 @Override void read(Tokeniser t, CharacterReader r) { 1620 String data = r.consumeTo("]]>"); 1621 t.dataBuffer.append(data); 1622 if (r.matchConsume("]]>") || r.isEmpty()) { 1623 t.emit(new Token.CData(t.dataBuffer.toString())); 1624 t.transition(Data); 1625 }// otherwise, buffer underrun, stay in data section 1626 } 1627 }; 1628 1629 read(Tokeniser t, CharacterReader r)1630 abstract void read(Tokeniser t, CharacterReader r); 1631 1632 static final char nullChar = '\u0000'; 1633 // char searches. must be sorted, used in inSorted. MUST update TokenisetStateTest if more arrays are added. 1634 static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'}; 1635 static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'}; 1636 1637 private static final char replacementChar = Tokeniser.replacementChar; 1638 private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); 1639 private static final char eof = CharacterReader.EOF; 1640 1641 /** 1642 * Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just 1643 * different else exit transitions. 1644 */ handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition)1645 private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) { 1646 if (r.matchesLetter()) { 1647 String name = r.consumeLetterSequence(); 1648 t.tagPending.appendTagName(name); 1649 t.dataBuffer.append(name); 1650 return; 1651 } 1652 1653 boolean needsExitTransition = false; 1654 if (t.isAppropriateEndTagToken() && !r.isEmpty()) { 1655 char c = r.consume(); 1656 switch (c) { 1657 case '\t': 1658 case '\n': 1659 case '\r': 1660 case '\f': 1661 case ' ': 1662 t.transition(BeforeAttributeName); 1663 break; 1664 case '/': 1665 t.transition(SelfClosingStartTag); 1666 break; 1667 case '>': 1668 t.emitTagPending(); 1669 t.transition(Data); 1670 break; 1671 default: 1672 t.dataBuffer.append(c); 1673 needsExitTransition = true; 1674 } 1675 } else { 1676 needsExitTransition = true; 1677 } 1678 1679 if (needsExitTransition) { 1680 t.emit("</"); 1681 t.emit(t.dataBuffer); 1682 t.transition(elseTransition); 1683 } 1684 } 1685 readRawData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance)1686 private static void readRawData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance) { 1687 switch (r.current()) { 1688 case '<': 1689 t.advanceTransition(advance); 1690 break; 1691 case nullChar: 1692 t.error(current); 1693 r.advance(); 1694 t.emit(replacementChar); 1695 break; 1696 case eof: 1697 t.emit(new Token.EOF()); 1698 break; 1699 default: 1700 String data = r.consumeRawData(); 1701 t.emit(data); 1702 break; 1703 } 1704 } 1705 readCharRef(Tokeniser t, TokeniserState advance)1706 private static void readCharRef(Tokeniser t, TokeniserState advance) { 1707 int[] c = t.consumeCharacterReference(null, false); 1708 if (c == null) 1709 t.emit('&'); 1710 else 1711 t.emit(c); 1712 t.transition(advance); 1713 } 1714 readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b)1715 private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) { 1716 if (r.matchesAsciiAlpha()) { 1717 t.createTagPending(false); 1718 t.transition(a); 1719 } else { 1720 t.emit("</"); 1721 t.transition(b); 1722 } 1723 } 1724 handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback)1725 private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) { 1726 if (r.matchesLetter()) { 1727 String name = r.consumeLetterSequence(); 1728 t.dataBuffer.append(name); 1729 t.emit(name); 1730 return; 1731 } 1732 1733 char c = r.consume(); 1734 switch (c) { 1735 case '\t': 1736 case '\n': 1737 case '\r': 1738 case '\f': 1739 case ' ': 1740 case '/': 1741 case '>': 1742 if (t.dataBuffer.toString().equals("script")) 1743 t.transition(primary); 1744 else 1745 t.transition(fallback); 1746 t.emit(c); 1747 break; 1748 default: 1749 r.unconsume(); 1750 t.transition(fallback); 1751 } 1752 } 1753 } 1754