• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011, Mike Samuel
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 // Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // Neither the name of the OWASP nor the names of its contributors may
14 // be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 // POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html;
30 
31 import com.google.common.collect.ImmutableSet;
32 import com.google.common.collect.Lists;
33 import java.util.LinkedList;
34 import java.util.NoSuchElementException;
35 import java.util.Set;
36 
37 import javax.annotation.concurrent.NotThreadSafe;
38 
39 /**
40  * A flexible lexer for HTML.
41  * This is hairy code, but it is outside the TCB for the HTML sanitizer.
42  *
43  * @author Mike Samuel <mikesamuel@gmail.com>
44  */
45 @NotThreadSafe
46 final class HtmlLexer extends AbstractTokenStream {
47   private final String input;
48   private final HtmlInputSplitter splitter;
49   private State state = State.OUTSIDE_TAG;
50 
HtmlLexer(String input)51   public HtmlLexer(String input) {
52     this.input = input;
53     this.splitter = new HtmlInputSplitter(input);
54   }
55 
56   /**
57    * Normalize case of names that are not name-spaced.  This lower-cases HTML
58    * element and attribute names, but not ones for embedded SVG or MATHML.
59    */
canonicalName(String elementOrAttribName)60   static String canonicalName(String elementOrAttribName) {
61     return elementOrAttribName.indexOf(':') >= 0
62         ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
63   }
64 
65   /**
66    * An FSM that lets us reclassify text tokens inside tags as attribute
67    * names/values
68    */
69   private static enum State {
70     OUTSIDE_TAG,
71     IN_TAG,
72     SAW_NAME,
73     SAW_EQ,
74     ;
75   }
76 
77   /**
78    * Makes sure that this.token contains a token if one is available.
79    * This may require fetching and combining multiple tokens from the underlying
80    * splitter.
81    */
82   @Override
produce()83   protected HtmlToken produce() {
84     HtmlToken token = readToken();
85     if (token == null) { return null; }
86 
87     switch (token.type) {
88 
89       // Keep track of whether we're inside a tag or not.
90       case TAGBEGIN:
91         state = State.IN_TAG;
92         break;
93       case TAGEND:
94         if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
95           // Distinguish <input type=checkbox checked=> from
96           // <input type=checkbox checked>
97           pushbackToken(token);
98           state = State.IN_TAG;
99           return HtmlToken.instance(
100               token.start, token.start, HtmlTokenType.ATTRVALUE);
101         }
102 
103         state = State.OUTSIDE_TAG;
104         break;
105 
106       // Drop ignorable tokens by zeroing out the one received and recursing
107       case IGNORABLE:
108         return produce();
109 
110       // collapse adjacent text nodes if we're outside a tag, or otherwise,
111       // Recognize attribute names and values.
112       default:
113         switch (state) {
114           case OUTSIDE_TAG:
115             if (HtmlTokenType.TEXT == token.type
116                 || HtmlTokenType.UNESCAPED == token.type) {
117               token = collapseSubsequent(token);
118             }
119             break;
120           case IN_TAG:
121             if (HtmlTokenType.TEXT == token.type
122                 && !token.tokenInContextMatches(input, "=")) {
123               // Reclassify as attribute name
124               token = HtmlInputSplitter.reclassify(
125                   token, HtmlTokenType.ATTRNAME);
126               state = State.SAW_NAME;
127             }
128             break;
129           case SAW_NAME:
130             if (HtmlTokenType.TEXT == token.type) {
131               if (token.tokenInContextMatches(input, "=")) {
132                 state = State.SAW_EQ;
133                 // Skip the '=' token
134                 return produce();
135               } else {
136                 // Reclassify as attribute name
137                 token = HtmlInputSplitter.reclassify(
138                     token, HtmlTokenType.ATTRNAME);
139               }
140             } else {
141               state = State.IN_TAG;
142             }
143             break;
144           case SAW_EQ:
145             if (HtmlTokenType.TEXT == token.type
146                 || HtmlTokenType.QSTRING == token.type) {
147               if (HtmlTokenType.TEXT == token.type) {
148                 // Collapse adjacent text nodes to properly handle
149                 //   <a onclick=this.clicked=true>
150                 //   <a title=foo bar>
151                 token = collapseAttributeName(token);
152               }
153               // Reclassify as value
154               token = HtmlInputSplitter.reclassify(
155                   token, HtmlTokenType.ATTRVALUE);
156               state = State.IN_TAG;
157             }
158             break;
159         }
160         break;
161     }
162 
163     return token;
164   }
165 
166   /**
167    * Collapses all the following tokens of the same type into this.token.
168    */
collapseSubsequent(HtmlToken token)169   private HtmlToken collapseSubsequent(HtmlToken token) {
170     HtmlToken collapsed = token;
171     for (HtmlToken next;
172          (next= peekToken(0)) != null && next.type == token.type;
173          readToken()) {
174       collapsed = join(collapsed, next);
175     }
176     return collapsed;
177   }
178 
collapseAttributeName(HtmlToken token)179   private HtmlToken collapseAttributeName(HtmlToken token) {
180     // We want to collapse tokens into the value that are not parts of an
181     // attribute value.  We should include any space or text adjacent to the
182     // value, but should stop at any of the following constructions:
183     //   space end-of-file              e.g. name=foo_
184     //   space valueless-attrib-name    e.g. name=foo checked
185     //   space tag-end                  e.g. name=foo />
186     //   space text space? '='          e.g. name=foo bar=
187     int nToMerge = 0;
188     for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
189       if (t.type == HtmlTokenType.IGNORABLE) {
190         HtmlToken tok = peekToken(nToMerge + 1);
191         if (tok == null) { break; }
192         if (tok.type != HtmlTokenType.TEXT) { break; }
193         if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
194           break;
195         }
196         HtmlToken eq = peekToken(nToMerge + 2);
197         if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
198           eq = peekToken(nToMerge + 3);
199         }
200         if (eq == null || eq.tokenInContextMatches(input, "=")) {
201           break;
202         }
203       } else if (t.type != HtmlTokenType.TEXT) {
204         break;
205       }
206       ++nToMerge;
207     }
208     if (nToMerge == 0) { return token; }
209 
210     int end = token.end;
211     do {
212       end = readToken().end;
213     } while (--nToMerge > 0);
214 
215     return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
216   }
217 
join(HtmlToken a, HtmlToken b)218   private static HtmlToken join(HtmlToken a, HtmlToken b) {
219     return HtmlToken.instance(a.start, b.end, a.type);
220   }
221 
222   private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
readToken()223   private HtmlToken readToken() {
224     if (!lookahead.isEmpty()) {
225       return lookahead.remove();
226     } else if (splitter.hasNext()) {
227       return splitter.next();
228     } else {
229       return null;
230     }
231   }
232 
peekToken(int i)233   private HtmlToken peekToken(int i) {
234     while (lookahead.size() <= i && splitter.hasNext()) {
235       lookahead.add(splitter.next());
236     }
237     return lookahead.size() > i ? lookahead.get(i) : null;
238   }
239 
pushbackToken(HtmlToken token)240   private void pushbackToken(HtmlToken token) {
241     lookahead.addFirst(token);
242   }
243 
244   /** Can the attribute appear in HTML without a value. */
isValuelessAttribute(String attribName)245   private static boolean isValuelessAttribute(String attribName) {
246     boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
247         Strings.toLowerCase(attribName));
248     return valueless;
249   }
250 
251   // From http://issues.apache.org/jira/browse/XALANC-519
252   private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
253       "checked", "compact", "declare", "defer", "disabled",
254       "ismap", "multiple", "nohref", "noresize", "noshade",
255       "nowrap", "readonly", "selected");
256 }
257 
258 /**
259  * A token stream that breaks a character stream into <tt>
260  * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
261  * tokens.  The matching of attribute names and values is done in a later step.
262  */
263 final class HtmlInputSplitter extends AbstractTokenStream {
264   /** The source of HTML character data. */
265   private final String input;
266   /** An offset into input. */
267   private int offset;
268   /** True iff the current character is inside a tag. */
269   private boolean inTag;
270   /**
271    * True if inside a script, xmp, listing, or similar tag whose content does
272    * not follow the normal escaping rules.
273    */
274   private boolean inEscapeExemptBlock;
275 
276   /**
277    * Null or the name of the close tag required to end the current escape exempt
278    * block.
279    * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
280    * contain unescaped HTML input.
281    */
282   private String escapeExemptTagName = null;
283 
284   private HtmlTextEscapingMode textEscapingMode;
285 
HtmlInputSplitter(String input)286   public HtmlInputSplitter(String input) {
287     this.input = input;
288   }
289 
290   /**
291    * Make sure that there is a token ready to yield in this.token.
292    */
293   @Override
produce()294   protected HtmlToken produce() {
295     HtmlToken token = parseToken();
296     if (null == token) { return null; }
297 
298     // Handle escape-exempt blocks.
299     // The parse() method is only dimly aware of escape-excempt blocks, so
300     // here we detect the beginning and ends of escape exempt blocks, and
301     // reclassify as UNESCAPED, any tokens that appear in the middle.
302     if (inEscapeExemptBlock) {
303       if (token.type != HtmlTokenType.SERVERCODE) {
304         // classify RCDATA as text since it can contain entities
305         token = reclassify(
306             token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
307                     ? HtmlTokenType.TEXT
308                     : HtmlTokenType.UNESCAPED));
309       }
310     } else {
311       switch (token.type) {
312         case TAGBEGIN:
313           {
314             String canonTagName = canonicalName(
315                 token.start + 1, token.end);
316             if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
317                     canonTagName)) {
318               this.escapeExemptTagName = canonTagName;
319               this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
320                   canonTagName);
321             }
322             break;
323           }
324         case TAGEND:
325           this.inEscapeExemptBlock = null != this.escapeExemptTagName;
326           break;
327         default:
328           break;
329       }
330     }
331     return token;
332   }
333 
334   /**
335    * States for a state machine for optimistically identifying tags and other
336    * html/xml/phpish structures.
337    */
338   private static enum State {
339     TAGNAME,
340     SLASH,
341     BANG,
342     BANG_DASH,
343     COMMENT,
344     COMMENT_DASH,
345     COMMENT_DASH_DASH,
346     DIRECTIVE,
347     DONE,
348     BOGUS_COMMENT,
349     SERVER_CODE,
350     SERVER_CODE_PCT,
351 
352     // From HTML 5 section 8.1.2.6
353 
354     // The text in CDATA and RCDATA elements must not contain any
355     // occurrences of the string "</" followed by characters that
356     // case-insensitively match the tag name of the element followed
357     // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
358     // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
359     // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
360     // that string is part of an escaping text span.
361 
362     // An escaping text span is a span of text (in CDATA and RCDATA
363     // elements) and character entity references (in RCDATA elements)
364     // that starts with an escaping text span start that is not itself
365     // in an escaping text span, and ends at the next escaping text
366     // span end.
367 
368     // An escaping text span start is a part of text that consists of
369     // the four character sequence "<!--".
370 
371     // An escaping text span end is a part of text that consists of
372     // the three character sequence "-->".
373 
374     // An escaping text span start may share its U+002D HYPHEN-MINUS characters
375     // with its corresponding escaping text span end.
376     UNESCAPED_LT_BANG,             // <!
377     UNESCAPED_LT_BANG_DASH,        // <!-
378     ESCAPING_TEXT_SPAN,            // Inside an escaping text span
379     ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
380     ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
381     ;
382   }
383 
384   private HtmlToken lastNonIgnorable = null;
385   /**
386    * Breaks the character stream into tokens.
387    * This method returns a stream of tokens such that each token starts where
388    * the last token ended.
389    *
390    * <p>This property is useful as it allows fetch to collapse and reclassify
391    * ranges of tokens based on state that is easy to maintain there.
392    *
393    * <p>Later passes are responsible for throwing away useless tokens.
394    */
parseToken()395   private HtmlToken parseToken() {
396     int start = offset;
397     int limit = input.length();
398     if (start == limit) { return null; }
399 
400     int end = start + 1;
401     HtmlTokenType type;
402 
403     char ch = input.charAt(start);
404     if (inTag) {
405       if ('>' == ch) {
406         type = HtmlTokenType.TAGEND;
407         inTag = false;
408       } else if ('/' == ch) {
409         if (end != limit && '>' == input.charAt(end)) {
410           type = HtmlTokenType.TAGEND;
411           inTag = false;
412           ++end;
413         } else {
414           type = HtmlTokenType.TEXT;
415         }
416       } else if ('=' == ch) {
417         type = HtmlTokenType.TEXT;
418       } else if ('"' == ch || '\'' == ch) {
419         type = HtmlTokenType.QSTRING;
420         int delim = ch;
421         for (; end < limit; ++end) {
422           if (input.charAt(end) == delim) {
423             ++end;
424             break;
425           }
426         }
427       } else if (!Character.isWhitespace(ch)) {
428         type = HtmlTokenType.TEXT;
429         for (; end < limit; ++end) {
430           ch = input.charAt(end);
431           // End a text chunk before />
432           if ((lastNonIgnorable == null
433                || !lastNonIgnorable.tokenInContextMatches(input, "="))
434               && '/' == ch && end + 1 < limit
435               && '>' == input.charAt(end + 1)) {
436             break;
437           } else if ('>' == ch || '=' == ch
438                      || Character.isWhitespace(ch)) {
439             break;
440           } else if ('"' == ch || '\'' == ch) {
441             if (end + 1 < limit) {
442               char ch2 = input.charAt(end + 1);
443               if (ch2 >= 0 && Character.isWhitespace(ch2)
444                   || ch2 == '>' || ch2 == '/') {
445                 ++end;
446                 break;
447               }
448             }
449           }
450         }
451       } else {
452         // We skip whitespace tokens inside tag bodies.
453         type = HtmlTokenType.IGNORABLE;
454         while (end < limit && Character.isWhitespace(input.charAt(end))) {
455           ++end;
456         }
457       }
458     } else {
459       if (ch == '<') {
460         if (end == limit) {
461           type = HtmlTokenType.TEXT;
462         } else {
463           ch = input.charAt(end);
464           type = null;
465           State state = null;
466           switch (ch) {
467             case '/':  // close tag?
468               state = State.SLASH;
469               ++end;
470               break;
471             case '!':  // Comment or declaration
472               if (!this.inEscapeExemptBlock) {
473                 state = State.BANG;
474               } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
475                              escapeExemptTagName)) {
476                 // Directives, and cdata suppressed in escape
477                 // exempt mode as they could obscure the close of the
478                 // escape exempty block, but comments are similar to escaping
479                 // text spans, and are significant in all CDATA and RCDATA
480                 // blocks except those inside <xmp> tags.
481                 // See "Escaping text spans" in section 8.1.2.6 of HTML5.
482                 // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
483                 state = State.UNESCAPED_LT_BANG;
484               }
485               ++end;
486               break;
487             case '?':
488               if (!this.inEscapeExemptBlock) {
489                 state = State.BOGUS_COMMENT;
490               }
491               ++end;
492               break;
493             case '%':
494               state = State.SERVER_CODE;
495               ++end;
496               break;
497             default:
498               if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
499                 state = State.TAGNAME;
500                 ++end;
501               } else if ('<' == ch) {
502                 type = HtmlTokenType.TEXT;
503               } else {
504                 ++end;
505               }
506               break;
507           }
508           if (null != state) {
509             charloop:
510             while (end < limit) {
511               ch = input.charAt(end);
512               switch (state) {
513                 case TAGNAME:
514                   if (Character.isWhitespace(ch)
515                       || '>' == ch || '/' == ch || '<' == ch) {
516                     // End processing of an escape exempt block when we see
517                     // a corresponding end tag.
518                     if (this.inEscapeExemptBlock
519                         && '/' == input.charAt(start + 1)
520                         && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
521                         && canonicalName(start + 2, end)
522                             .equals(escapeExemptTagName)) {
523                       this.inEscapeExemptBlock = false;
524                       this.escapeExemptTagName = null;
525                       this.textEscapingMode = null;
526                     }
527                     type = HtmlTokenType.TAGBEGIN;
528                     // Don't process content as attributes if we're inside
529                     // an escape exempt block.
530                     inTag = !this.inEscapeExemptBlock;
531                     state = State.DONE;
532                     break charloop;
533                   }
534                   break;
535                 case SLASH:
536                   if (Character.isLetter(ch)) {
537                     state = State.TAGNAME;
538                   } else {
539                     if ('<' == ch) {
540                       type = HtmlTokenType.TEXT;
541                     } else {
542                       ++end;
543                     }
544                     break charloop;
545                   }
546                   break;
547                 case BANG:
548                   if ('-' == ch) {
549                     state = State.BANG_DASH;
550                   } else {
551                     state = State.DIRECTIVE;
552                   }
553                   break;
554                 case BANG_DASH:
555                   if ('-' == ch) {
556                     state = State.COMMENT;
557                   } else {
558                     state = State.DIRECTIVE;
559                   }
560                   break;
561                 case COMMENT:
562                   if ('-' == ch) {
563                     state = State.COMMENT_DASH;
564                   }
565                   break;
566                 case COMMENT_DASH:
567                   state = ('-' == ch)
568                       ? State.COMMENT_DASH_DASH
569                       : State.COMMENT_DASH;
570                   break;
571                 case COMMENT_DASH_DASH:
572                   if ('>' == ch) {
573                     state = State.DONE;
574                     type = HtmlTokenType.COMMENT;
575                   } else if ('-' == ch) {
576                     state = State.COMMENT_DASH_DASH;
577                   } else {
578                     state = State.COMMENT_DASH;
579                   }
580                   break;
581                 case DIRECTIVE:
582                   if ('>' == ch) {
583                     type = HtmlTokenType.DIRECTIVE;
584                     state = State.DONE;
585                   }
586                   break;
587                 case BOGUS_COMMENT:
588                   if ('>' == ch) {
589                     type = HtmlTokenType.QMARKMETA;
590                     state = State.DONE;
591                   }
592                   break;
593                 case SERVER_CODE:
594                   if ('%' == ch) {
595                     state = State.SERVER_CODE_PCT;
596                   }
597                   break;
598                 case SERVER_CODE_PCT:
599                   if ('>' == ch) {
600                     type = HtmlTokenType.SERVERCODE;
601                     state = State.DONE;
602                   } else if ('%' != ch) {
603                     state = State.SERVER_CODE;
604                   }
605                   break;
606                 case UNESCAPED_LT_BANG:
607                   if ('-' == ch) {
608                     state = State.UNESCAPED_LT_BANG_DASH;
609                   } else {
610                     type = HtmlTokenType.TEXT;
611                     state = State.DONE;
612                   }
613                   break;
614                 case UNESCAPED_LT_BANG_DASH:
615                   if ('-' == ch) {
616                     // According to HTML 5 section 8.1.2.6
617 
618                     // An escaping text span start may share its
619                     // U+002D HYPHEN-MINUS characters with its
620                     // corresponding escaping text span end.
621                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
622                   } else {
623                     type = HtmlTokenType.TEXT;
624                     state = State.DONE;
625                   }
626                   break;
627                 case ESCAPING_TEXT_SPAN:
628                   if ('-' == ch) {
629                     state = State.ESCAPING_TEXT_SPAN_DASH;
630                   }
631                   break;
632                 case ESCAPING_TEXT_SPAN_DASH:
633                   if ('-' == ch) {
634                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
635                   } else {
636                     state = State.ESCAPING_TEXT_SPAN;
637                   }
638                   break;
639                 case ESCAPING_TEXT_SPAN_DASH_DASH:
640                   if ('>' == ch) {
641                     type = HtmlTokenType.TEXT;
642                     state = State.DONE;
643                   } else if ('-' != ch) {
644                     state = State.ESCAPING_TEXT_SPAN;
645                   }
646                   break;
647                 case DONE:
648                   throw new AssertionError(
649                       "Unexpectedly DONE while lexing HTML token stream");
650               }
651               ++end;
652               if (State.DONE == state) { break; }
653             }
654             if (end == limit) {
655               switch (state) {
656                 case DONE:
657                   break;
658                 case BOGUS_COMMENT:
659                   type = HtmlTokenType.QMARKMETA;
660                   break;
661                 case COMMENT:
662                 case COMMENT_DASH:
663                 case COMMENT_DASH_DASH:
664                   type = HtmlTokenType.COMMENT;
665                   break;
666                 case DIRECTIVE:
667                 case SERVER_CODE:
668                 case SERVER_CODE_PCT:
669                   type = HtmlTokenType.SERVERCODE;
670                   break;
671                 case TAGNAME:
672                   type = HtmlTokenType.TAGBEGIN;
673                   break;
674                 default:
675                   type = HtmlTokenType.TEXT;
676                   break;
677               }
678             }
679           }
680         }
681       } else {
682         type = null;
683       }
684     }
685     if (null == type) {
686       while (end < limit && '<' != input.charAt(end)) { ++end; }
687       type = HtmlTokenType.TEXT;
688     }
689 
690     offset = end;
691     HtmlToken result = HtmlToken.instance(start, end, type);
692     if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
693     return result;
694   }
695 
canonicalName(int start, int end)696   private String canonicalName(int start, int end) {
697     return HtmlLexer.canonicalName(input.substring(start, end));
698   }
699 
isIdentStart(char ch)700   private static boolean isIdentStart(char ch) {
701     return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
702   }
703 
reclassify(HtmlToken token, HtmlTokenType type)704   static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
705     return HtmlToken.instance(token.start, token.end, type);
706   }
707 }
708 
709 
710 /**
711  * A TokenStream that lazily fetches one token at a time.
712  *
713  * @author Mike Samuel <mikesamuel@gmail.com>
714  */
715 abstract class AbstractTokenStream implements TokenStream {
716   private HtmlToken tok;
717 
hasNext()718   public final boolean hasNext() {
719     if (tok == null) { tok = produce(); }
720     return tok != null;
721   }
722 
next()723   public HtmlToken next() {
724     if (this.tok == null) { this.tok = produce(); }
725     HtmlToken t = this.tok;
726     if (t == null) { throw new NoSuchElementException(); }
727     this.tok = null;
728     return t;
729   }
730 
produce()731   protected abstract HtmlToken produce();
732 }
733