1 // Copyright (c) 2011, Mike Samuel 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // Redistributions in binary form must reproduce the above copyright 11 // notice, this list of conditions and the following disclaimer in the 12 // documentation and/or other materials provided with the distribution. 13 // Neither the name of the OWASP nor the names of its contributors may 14 // be used to endorse or promote products derived from this software 15 // without specific prior written permission. 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 // POSSIBILITY OF SUCH DAMAGE. 28 29 package org.owasp.html; 30 31 import com.google.common.collect.ImmutableSet; 32 import com.google.common.collect.Lists; 33 import java.util.LinkedList; 34 import java.util.NoSuchElementException; 35 import java.util.Set; 36 37 import javax.annotation.concurrent.NotThreadSafe; 38 39 /** 40 * A flexible lexer for HTML. 41 * This is hairy code, but it is outside the TCB for the HTML sanitizer. 42 * 43 * @author Mike Samuel <mikesamuel@gmail.com> 44 */ 45 @NotThreadSafe 46 final class HtmlLexer extends AbstractTokenStream { 47 private final String input; 48 private final HtmlInputSplitter splitter; 49 private State state = State.OUTSIDE_TAG; 50 HtmlLexer(String input)51 public HtmlLexer(String input) { 52 this.input = input; 53 this.splitter = new HtmlInputSplitter(input); 54 } 55 56 /** 57 * Normalize case of names that are not name-spaced. This lower-cases HTML 58 * element and attribute names, but not ones for embedded SVG or MATHML. 59 */ canonicalName(String elementOrAttribName)60 static String canonicalName(String elementOrAttribName) { 61 return elementOrAttribName.indexOf(':') >= 0 62 ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName); 63 } 64 65 /** 66 * An FSM that lets us reclassify text tokens inside tags as attribute 67 * names/values 68 */ 69 private static enum State { 70 OUTSIDE_TAG, 71 IN_TAG, 72 SAW_NAME, 73 SAW_EQ, 74 ; 75 } 76 77 /** 78 * Makes sure that this.token contains a token if one is available. 79 * This may require fetching and combining multiple tokens from the underlying 80 * splitter. 81 */ 82 @Override produce()83 protected HtmlToken produce() { 84 HtmlToken token = readToken(); 85 if (token == null) { return null; } 86 87 switch (token.type) { 88 89 // Keep track of whether we're inside a tag or not. 90 case TAGBEGIN: 91 state = State.IN_TAG; 92 break; 93 case TAGEND: 94 if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) { 95 // Distinguish <input type=checkbox checked=> from 96 // <input type=checkbox checked> 97 pushbackToken(token); 98 state = State.IN_TAG; 99 return HtmlToken.instance( 100 token.start, token.start, HtmlTokenType.ATTRVALUE); 101 } 102 103 state = State.OUTSIDE_TAG; 104 break; 105 106 // Drop ignorable tokens by zeroing out the one received and recursing 107 case IGNORABLE: 108 return produce(); 109 110 // collapse adjacent text nodes if we're outside a tag, or otherwise, 111 // Recognize attribute names and values. 112 default: 113 switch (state) { 114 case OUTSIDE_TAG: 115 if (HtmlTokenType.TEXT == token.type 116 || HtmlTokenType.UNESCAPED == token.type) { 117 token = collapseSubsequent(token); 118 } 119 break; 120 case IN_TAG: 121 if (HtmlTokenType.TEXT == token.type 122 && !token.tokenInContextMatches(input, "=")) { 123 // Reclassify as attribute name 124 token = HtmlInputSplitter.reclassify( 125 token, HtmlTokenType.ATTRNAME); 126 state = State.SAW_NAME; 127 } 128 break; 129 case SAW_NAME: 130 if (HtmlTokenType.TEXT == token.type) { 131 if (token.tokenInContextMatches(input, "=")) { 132 state = State.SAW_EQ; 133 // Skip the '=' token 134 return produce(); 135 } else { 136 // Reclassify as attribute name 137 token = HtmlInputSplitter.reclassify( 138 token, HtmlTokenType.ATTRNAME); 139 } 140 } else { 141 state = State.IN_TAG; 142 } 143 break; 144 case SAW_EQ: 145 if (HtmlTokenType.TEXT == token.type 146 || HtmlTokenType.QSTRING == token.type) { 147 if (HtmlTokenType.TEXT == token.type) { 148 // Collapse adjacent text nodes to properly handle 149 // <a onclick=this.clicked=true> 150 // <a title=foo bar> 151 token = collapseAttributeName(token); 152 } 153 // Reclassify as value 154 token = HtmlInputSplitter.reclassify( 155 token, HtmlTokenType.ATTRVALUE); 156 state = State.IN_TAG; 157 } 158 break; 159 } 160 break; 161 } 162 163 return token; 164 } 165 166 /** 167 * Collapses all the following tokens of the same type into this.token. 168 */ collapseSubsequent(HtmlToken token)169 private HtmlToken collapseSubsequent(HtmlToken token) { 170 HtmlToken collapsed = token; 171 for (HtmlToken next; 172 (next= peekToken(0)) != null && next.type == token.type; 173 readToken()) { 174 collapsed = join(collapsed, next); 175 } 176 return collapsed; 177 } 178 collapseAttributeName(HtmlToken token)179 private HtmlToken collapseAttributeName(HtmlToken token) { 180 // We want to collapse tokens into the value that are not parts of an 181 // attribute value. We should include any space or text adjacent to the 182 // value, but should stop at any of the following constructions: 183 // space end-of-file e.g. name=foo_ 184 // space valueless-attrib-name e.g. name=foo checked 185 // space tag-end e.g. name=foo /> 186 // space text space? '=' e.g. name=foo bar= 187 int nToMerge = 0; 188 for (HtmlToken t; (t = peekToken(nToMerge)) != null;) { 189 if (t.type == HtmlTokenType.IGNORABLE) { 190 HtmlToken tok = peekToken(nToMerge + 1); 191 if (tok == null) { break; } 192 if (tok.type != HtmlTokenType.TEXT) { break; } 193 if (isValuelessAttribute(input.substring(tok.start, tok.end))) { 194 break; 195 } 196 HtmlToken eq = peekToken(nToMerge + 2); 197 if (eq != null && eq.type == HtmlTokenType.IGNORABLE) { 198 eq = peekToken(nToMerge + 3); 199 } 200 if (eq == null || eq.tokenInContextMatches(input, "=")) { 201 break; 202 } 203 } else if (t.type != HtmlTokenType.TEXT) { 204 break; 205 } 206 ++nToMerge; 207 } 208 if (nToMerge == 0) { return token; } 209 210 int end = token.end; 211 do { 212 end = readToken().end; 213 } while (--nToMerge > 0); 214 215 return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT); 216 } 217 join(HtmlToken a, HtmlToken b)218 private static HtmlToken join(HtmlToken a, HtmlToken b) { 219 return HtmlToken.instance(a.start, b.end, a.type); 220 } 221 222 private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList(); readToken()223 private HtmlToken readToken() { 224 if (!lookahead.isEmpty()) { 225 return lookahead.remove(); 226 } else if (splitter.hasNext()) { 227 return splitter.next(); 228 } else { 229 return null; 230 } 231 } 232 peekToken(int i)233 private HtmlToken peekToken(int i) { 234 while (lookahead.size() <= i && splitter.hasNext()) { 235 lookahead.add(splitter.next()); 236 } 237 return lookahead.size() > i ? lookahead.get(i) : null; 238 } 239 pushbackToken(HtmlToken token)240 private void pushbackToken(HtmlToken token) { 241 lookahead.addFirst(token); 242 } 243 244 /** Can the attribute appear in HTML without a value. */ isValuelessAttribute(String attribName)245 private static boolean isValuelessAttribute(String attribName) { 246 boolean valueless = VALUELESS_ATTRIB_NAMES.contains( 247 Strings.toLowerCase(attribName)); 248 return valueless; 249 } 250 251 // From http://issues.apache.org/jira/browse/XALANC-519 252 private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of( 253 "checked", "compact", "declare", "defer", "disabled", 254 "ismap", "multiple", "nohref", "noresize", "noshade", 255 "nowrap", "readonly", "selected"); 256 } 257 258 /** 259 * A token stream that breaks a character stream into <tt> 260 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt> 261 * tokens. The matching of attribute names and values is done in a later step. 262 */ 263 final class HtmlInputSplitter extends AbstractTokenStream { 264 /** The source of HTML character data. */ 265 private final String input; 266 /** An offset into input. */ 267 private int offset; 268 /** True iff the current character is inside a tag. */ 269 private boolean inTag; 270 /** 271 * True if inside a script, xmp, listing, or similar tag whose content does 272 * not follow the normal escaping rules. 273 */ 274 private boolean inEscapeExemptBlock; 275 276 /** 277 * Null or the name of the close tag required to end the current escape exempt 278 * block. 279 * Preformatted tags include <script>, <xmp>, etc. that may 280 * contain unescaped HTML input. 281 */ 282 private String escapeExemptTagName = null; 283 284 private HtmlTextEscapingMode textEscapingMode; 285 HtmlInputSplitter(String input)286 public HtmlInputSplitter(String input) { 287 this.input = input; 288 } 289 290 /** 291 * Make sure that there is a token ready to yield in this.token. 292 */ 293 @Override produce()294 protected HtmlToken produce() { 295 HtmlToken token = parseToken(); 296 if (null == token) { return null; } 297 298 // Handle escape-exempt blocks. 299 // The parse() method is only dimly aware of escape-excempt blocks, so 300 // here we detect the beginning and ends of escape exempt blocks, and 301 // reclassify as UNESCAPED, any tokens that appear in the middle. 302 if (inEscapeExemptBlock) { 303 if (token.type != HtmlTokenType.SERVERCODE) { 304 // classify RCDATA as text since it can contain entities 305 token = reclassify( 306 token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA 307 ? HtmlTokenType.TEXT 308 : HtmlTokenType.UNESCAPED)); 309 } 310 } else { 311 switch (token.type) { 312 case TAGBEGIN: 313 { 314 String canonTagName = canonicalName( 315 token.start + 1, token.end); 316 if (HtmlTextEscapingMode.isTagFollowedByLiteralContent( 317 canonTagName)) { 318 this.escapeExemptTagName = canonTagName; 319 this.textEscapingMode = HtmlTextEscapingMode.getModeForTag( 320 canonTagName); 321 } 322 break; 323 } 324 case TAGEND: 325 this.inEscapeExemptBlock = null != this.escapeExemptTagName; 326 break; 327 default: 328 break; 329 } 330 } 331 return token; 332 } 333 334 /** 335 * States for a state machine for optimistically identifying tags and other 336 * html/xml/phpish structures. 337 */ 338 private static enum State { 339 TAGNAME, 340 SLASH, 341 BANG, 342 BANG_DASH, 343 COMMENT, 344 COMMENT_DASH, 345 COMMENT_DASH_DASH, 346 DIRECTIVE, 347 DONE, 348 BOGUS_COMMENT, 349 SERVER_CODE, 350 SERVER_CODE_PCT, 351 352 // From HTML 5 section 8.1.2.6 353 354 // The text in CDATA and RCDATA elements must not contain any 355 // occurrences of the string "</" followed by characters that 356 // case-insensitively match the tag name of the element followed 357 // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), 358 // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE, 359 // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless 360 // that string is part of an escaping text span. 361 362 // An escaping text span is a span of text (in CDATA and RCDATA 363 // elements) and character entity references (in RCDATA elements) 364 // that starts with an escaping text span start that is not itself 365 // in an escaping text span, and ends at the next escaping text 366 // span end. 367 368 // An escaping text span start is a part of text that consists of 369 // the four character sequence "<!--". 370 371 // An escaping text span end is a part of text that consists of 372 // the three character sequence "-->". 373 374 // An escaping text span start may share its U+002D HYPHEN-MINUS characters 375 // with its corresponding escaping text span end. 376 UNESCAPED_LT_BANG, // <! 377 UNESCAPED_LT_BANG_DASH, // <!- 378 ESCAPING_TEXT_SPAN, // Inside an escaping text span 379 ESCAPING_TEXT_SPAN_DASH, // Seen - inside an escaping text span 380 ESCAPING_TEXT_SPAN_DASH_DASH, // Seen -- inside an escaping text span 381 ; 382 } 383 384 private HtmlToken lastNonIgnorable = null; 385 /** 386 * Breaks the character stream into tokens. 387 * This method returns a stream of tokens such that each token starts where 388 * the last token ended. 389 * 390 * <p>This property is useful as it allows fetch to collapse and reclassify 391 * ranges of tokens based on state that is easy to maintain there. 392 * 393 * <p>Later passes are responsible for throwing away useless tokens. 394 */ parseToken()395 private HtmlToken parseToken() { 396 int start = offset; 397 int limit = input.length(); 398 if (start == limit) { return null; } 399 400 int end = start + 1; 401 HtmlTokenType type; 402 403 char ch = input.charAt(start); 404 if (inTag) { 405 if ('>' == ch) { 406 type = HtmlTokenType.TAGEND; 407 inTag = false; 408 } else if ('/' == ch) { 409 if (end != limit && '>' == input.charAt(end)) { 410 type = HtmlTokenType.TAGEND; 411 inTag = false; 412 ++end; 413 } else { 414 type = HtmlTokenType.TEXT; 415 } 416 } else if ('=' == ch) { 417 type = HtmlTokenType.TEXT; 418 } else if ('"' == ch || '\'' == ch) { 419 type = HtmlTokenType.QSTRING; 420 int delim = ch; 421 for (; end < limit; ++end) { 422 if (input.charAt(end) == delim) { 423 ++end; 424 break; 425 } 426 } 427 } else if (!Character.isWhitespace(ch)) { 428 type = HtmlTokenType.TEXT; 429 for (; end < limit; ++end) { 430 ch = input.charAt(end); 431 // End a text chunk before /> 432 if ((lastNonIgnorable == null 433 || !lastNonIgnorable.tokenInContextMatches(input, "=")) 434 && '/' == ch && end + 1 < limit 435 && '>' == input.charAt(end + 1)) { 436 break; 437 } else if ('>' == ch || '=' == ch 438 || Character.isWhitespace(ch)) { 439 break; 440 } else if ('"' == ch || '\'' == ch) { 441 if (end + 1 < limit) { 442 char ch2 = input.charAt(end + 1); 443 if (ch2 >= 0 && Character.isWhitespace(ch2) 444 || ch2 == '>' || ch2 == '/') { 445 ++end; 446 break; 447 } 448 } 449 } 450 } 451 } else { 452 // We skip whitespace tokens inside tag bodies. 453 type = HtmlTokenType.IGNORABLE; 454 while (end < limit && Character.isWhitespace(input.charAt(end))) { 455 ++end; 456 } 457 } 458 } else { 459 if (ch == '<') { 460 if (end == limit) { 461 type = HtmlTokenType.TEXT; 462 } else { 463 ch = input.charAt(end); 464 type = null; 465 State state = null; 466 switch (ch) { 467 case '/': // close tag? 468 state = State.SLASH; 469 ++end; 470 break; 471 case '!': // Comment or declaration 472 if (!this.inEscapeExemptBlock) { 473 state = State.BANG; 474 } else if (HtmlTextEscapingMode.allowsEscapingTextSpan( 475 escapeExemptTagName)) { 476 // Directives, and cdata suppressed in escape 477 // exempt mode as they could obscure the close of the 478 // escape exempty block, but comments are similar to escaping 479 // text spans, and are significant in all CDATA and RCDATA 480 // blocks except those inside <xmp> tags. 481 // See "Escaping text spans" in section 8.1.2.6 of HTML5. 482 // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions 483 state = State.UNESCAPED_LT_BANG; 484 } 485 ++end; 486 break; 487 case '?': 488 if (!this.inEscapeExemptBlock) { 489 state = State.BOGUS_COMMENT; 490 } 491 ++end; 492 break; 493 case '%': 494 state = State.SERVER_CODE; 495 ++end; 496 break; 497 default: 498 if (isIdentStart(ch) && !this.inEscapeExemptBlock) { 499 state = State.TAGNAME; 500 ++end; 501 } else if ('<' == ch) { 502 type = HtmlTokenType.TEXT; 503 } else { 504 ++end; 505 } 506 break; 507 } 508 if (null != state) { 509 charloop: 510 while (end < limit) { 511 ch = input.charAt(end); 512 switch (state) { 513 case TAGNAME: 514 if (Character.isWhitespace(ch) 515 || '>' == ch || '/' == ch || '<' == ch) { 516 // End processing of an escape exempt block when we see 517 // a corresponding end tag. 518 if (this.inEscapeExemptBlock 519 && '/' == input.charAt(start + 1) 520 && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT 521 && canonicalName(start + 2, end) 522 .equals(escapeExemptTagName)) { 523 this.inEscapeExemptBlock = false; 524 this.escapeExemptTagName = null; 525 this.textEscapingMode = null; 526 } 527 type = HtmlTokenType.TAGBEGIN; 528 // Don't process content as attributes if we're inside 529 // an escape exempt block. 530 inTag = !this.inEscapeExemptBlock; 531 state = State.DONE; 532 break charloop; 533 } 534 break; 535 case SLASH: 536 if (Character.isLetter(ch)) { 537 state = State.TAGNAME; 538 } else { 539 if ('<' == ch) { 540 type = HtmlTokenType.TEXT; 541 } else { 542 ++end; 543 } 544 break charloop; 545 } 546 break; 547 case BANG: 548 if ('-' == ch) { 549 state = State.BANG_DASH; 550 } else { 551 state = State.DIRECTIVE; 552 } 553 break; 554 case BANG_DASH: 555 if ('-' == ch) { 556 state = State.COMMENT; 557 } else { 558 state = State.DIRECTIVE; 559 } 560 break; 561 case COMMENT: 562 if ('-' == ch) { 563 state = State.COMMENT_DASH; 564 } 565 break; 566 case COMMENT_DASH: 567 state = ('-' == ch) 568 ? State.COMMENT_DASH_DASH 569 : State.COMMENT_DASH; 570 break; 571 case COMMENT_DASH_DASH: 572 if ('>' == ch) { 573 state = State.DONE; 574 type = HtmlTokenType.COMMENT; 575 } else if ('-' == ch) { 576 state = State.COMMENT_DASH_DASH; 577 } else { 578 state = State.COMMENT_DASH; 579 } 580 break; 581 case DIRECTIVE: 582 if ('>' == ch) { 583 type = HtmlTokenType.DIRECTIVE; 584 state = State.DONE; 585 } 586 break; 587 case BOGUS_COMMENT: 588 if ('>' == ch) { 589 type = HtmlTokenType.QMARKMETA; 590 state = State.DONE; 591 } 592 break; 593 case SERVER_CODE: 594 if ('%' == ch) { 595 state = State.SERVER_CODE_PCT; 596 } 597 break; 598 case SERVER_CODE_PCT: 599 if ('>' == ch) { 600 type = HtmlTokenType.SERVERCODE; 601 state = State.DONE; 602 } else if ('%' != ch) { 603 state = State.SERVER_CODE; 604 } 605 break; 606 case UNESCAPED_LT_BANG: 607 if ('-' == ch) { 608 state = State.UNESCAPED_LT_BANG_DASH; 609 } else { 610 type = HtmlTokenType.TEXT; 611 state = State.DONE; 612 } 613 break; 614 case UNESCAPED_LT_BANG_DASH: 615 if ('-' == ch) { 616 // According to HTML 5 section 8.1.2.6 617 618 // An escaping text span start may share its 619 // U+002D HYPHEN-MINUS characters with its 620 // corresponding escaping text span end. 621 state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 622 } else { 623 type = HtmlTokenType.TEXT; 624 state = State.DONE; 625 } 626 break; 627 case ESCAPING_TEXT_SPAN: 628 if ('-' == ch) { 629 state = State.ESCAPING_TEXT_SPAN_DASH; 630 } 631 break; 632 case ESCAPING_TEXT_SPAN_DASH: 633 if ('-' == ch) { 634 state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 635 } else { 636 state = State.ESCAPING_TEXT_SPAN; 637 } 638 break; 639 case ESCAPING_TEXT_SPAN_DASH_DASH: 640 if ('>' == ch) { 641 type = HtmlTokenType.TEXT; 642 state = State.DONE; 643 } else if ('-' != ch) { 644 state = State.ESCAPING_TEXT_SPAN; 645 } 646 break; 647 case DONE: 648 throw new AssertionError( 649 "Unexpectedly DONE while lexing HTML token stream"); 650 } 651 ++end; 652 if (State.DONE == state) { break; } 653 } 654 if (end == limit) { 655 switch (state) { 656 case DONE: 657 break; 658 case BOGUS_COMMENT: 659 type = HtmlTokenType.QMARKMETA; 660 break; 661 case COMMENT: 662 case COMMENT_DASH: 663 case COMMENT_DASH_DASH: 664 type = HtmlTokenType.COMMENT; 665 break; 666 case DIRECTIVE: 667 case SERVER_CODE: 668 case SERVER_CODE_PCT: 669 type = HtmlTokenType.SERVERCODE; 670 break; 671 case TAGNAME: 672 type = HtmlTokenType.TAGBEGIN; 673 break; 674 default: 675 type = HtmlTokenType.TEXT; 676 break; 677 } 678 } 679 } 680 } 681 } else { 682 type = null; 683 } 684 } 685 if (null == type) { 686 while (end < limit && '<' != input.charAt(end)) { ++end; } 687 type = HtmlTokenType.TEXT; 688 } 689 690 offset = end; 691 HtmlToken result = HtmlToken.instance(start, end, type); 692 if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; } 693 return result; 694 } 695 canonicalName(int start, int end)696 private String canonicalName(int start, int end) { 697 return HtmlLexer.canonicalName(input.substring(start, end)); 698 } 699 isIdentStart(char ch)700 private static boolean isIdentStart(char ch) { 701 return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); 702 } 703 reclassify(HtmlToken token, HtmlTokenType type)704 static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) { 705 return HtmlToken.instance(token.start, token.end, type); 706 } 707 } 708 709 710 /** 711 * A TokenStream that lazily fetches one token at a time. 712 * 713 * @author Mike Samuel <mikesamuel@gmail.com> 714 */ 715 abstract class AbstractTokenStream implements TokenStream { 716 private HtmlToken tok; 717 hasNext()718 public final boolean hasNext() { 719 if (tok == null) { tok = produce(); } 720 return tok != null; 721 } 722 next()723 public HtmlToken next() { 724 if (this.tok == null) { this.tok = produce(); } 725 HtmlToken t = this.tok; 726 if (t == null) { throw new NoSuchElementException(); } 727 this.tok = null; 728 return t; 729 } 730 produce()731 protected abstract HtmlToken produce(); 732 } 733