1 /** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.mail.lib.html.parser; 17 18 import com.android.mail.lib.base.CharEscapers; 19 import com.android.mail.lib.base.CharMatcher; 20 import com.android.mail.lib.base.Preconditions; 21 import com.android.mail.lib.base.StringUtil; 22 import com.android.mail.lib.base.X; 23 import com.google.common.collect.Lists; 24 import com.google.common.collect.Maps; 25 import com.google.common.io.ByteStreams; 26 27 import java.io.IOException; 28 import java.util.ArrayList; 29 import java.util.HashMap; 30 import java.util.LinkedList; 31 import java.util.List; 32 import java.util.ListIterator; 33 import java.util.regex.Matcher; 34 import java.util.regex.Pattern; 35 36 /** 37 * HtmlParser is a simple but efficient html parser. 38 * - It's simple because it does not do incremental parsing like some other 39 * parser. It assumes that the entire html text is available. 40 * - It offers 3 levels of aggressiveness in correcting errors in HTML (see 41 * HtmlParser.ParseStyle). 42 * - HTML comments are ignored unless initialized with ParseStyle.PRESERVE_ALL. 43 */ 44 public class HtmlParser { 45 46 // States 47 private enum State { 48 IN_TEXT, IN_TAG, IN_COMMENT, IN_CDATA 49 } 50 51 // The current state 52 private State state; 53 54 private int clipLength = Integer.MAX_VALUE; 55 private boolean clipped; 56 57 // The html text 58 private String html; 59 60 // The entire array of nodes 61 private List<HtmlDocument.Node> nodes; 62 63 // Turn on for debug information. 64 private static boolean DEBUG = false; 65 66 // Default whitelist 67 public static final HtmlWhitelist DEFAULT_WHITELIST = HTML4.getWhitelist(); 68 69 // Whitelists for looking up accepted HTML tags and attributes 70 private List<HtmlWhitelist> whitelists = Lists.newArrayList(DEFAULT_WHITELIST); 71 72 /** 73 * This setting controls how much of the original HTML is preserved. In 74 * ascending order of aggressiveness: 75 * - PRESERVE_ALL: Preserves all of original content. 76 * *** Warning - PRESERVE_ALL mode preserves invalid and unsafe HTML. *** 77 * - PRESERVE_VALID: Preserves only valid visible HTML handled by 78 * most browsers. Discards comments, unknown tags and attributes, and 79 * nameless end tags. Encodes all '<' characters that aren't part of a tag. 80 * - NORMALIZE: In addition to the changes made by PRESERVE_VALID, also 81 * - unescapes then reescapes text to normalize entities 82 * - normalizes whitespace and quotes around tag attributes 83 */ 84 public enum ParseStyle { NORMALIZE, PRESERVE_VALID, PRESERVE_ALL } 85 86 /** 87 * True only in PRESERVE_ALL mode. 88 * @see HtmlParser.ParseStyle 89 */ 90 private final boolean preserveAll; 91 92 /** 93 * True in either PRESERVE_ALL or PRESERVE_VALID mode. 94 * @see HtmlParser.ParseStyle 95 */ 96 private final boolean preserveValidHtml; 97 98 /** 99 * @see HtmlParser#HtmlParser(HtmlParser.ParseStyle) 100 */ HtmlParser()101 public HtmlParser() { 102 this(ParseStyle.NORMALIZE); 103 } 104 105 /** 106 * @param parseStyle Level of aggressiveness for how different 107 * toHTML()/toXHTML() are from original. 108 * @see HtmlParser.ParseStyle 109 */ HtmlParser(ParseStyle parseStyle)110 public HtmlParser(ParseStyle parseStyle) { 111 preserveAll = (parseStyle == ParseStyle.PRESERVE_ALL); 112 preserveValidHtml = preserveAll || (parseStyle == ParseStyle.PRESERVE_VALID); 113 } 114 115 /** 116 * Sets the maximum length, in characters, of an HTML message. 117 * 118 * @param clipLength must be greater than zero. 119 * (It starts as Integer.MAX_VALUE) 120 */ setClipLength(int clipLength)121 public void setClipLength(int clipLength) { 122 if (clipLength <= 0) { 123 throw new IllegalArgumentException( 124 "clipLength '" + clipLength + "' <= 0"); 125 } 126 this.clipLength = clipLength; 127 } 128 isClipped()129 public boolean isClipped() { 130 return clipped; 131 } 132 133 /** 134 * Sets the HTML whitelist. Calling this overrides any whitelist(s) that 135 * the parser is configured to use. By default, the parser uses the standard 136 * HTML4 whitelist. 137 * 138 * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode. 139 * 140 * @param whitelist The whitelist to use. Must not be null. 141 */ setWhitelist(HtmlWhitelist whitelist)142 public void setWhitelist(HtmlWhitelist whitelist) { 143 Preconditions.checkNotNull(whitelist); 144 whitelists = Lists.newArrayList(whitelist); 145 } 146 147 /** 148 * Adds an HTML whitelist to the list of whitelists consulted when 149 * processing an element or attribute. By default, the parser only uses 150 * the standard HTML4 whitelist. 151 * 152 * Whitelists are consulted in reverse chronological order (starting from 153 * the most recently added whitelist). The last whitelist consulted will 154 * always be the standard HTML4 whitelist, unless this was overridden by 155 * a call to {@link #setWhitelist}. 156 * 157 * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode. 158 * 159 * @param whitelist The whitelist to use. 160 */ addWhitelist(HtmlWhitelist whitelist)161 public void addWhitelist(HtmlWhitelist whitelist) { 162 whitelists.add(whitelist); 163 } 164 165 /** 166 * These are characters that we don't want to allow unquoted in an attribute 167 * value because they might be interpreted by the browser as HTML control 168 * characters. They are the 5 characters that are escaped by 169 * com.google.common.base.CharEscapers.HTML_ESCAPE, plus '=' and whitespace. 170 * Note that it shouldn't be possible for '>' or whitespace to be parsed as 171 * part of an unquoted attribute value, but we leave them here for 172 * completeness. 173 * Package visibility for unit tests. 174 */ 175 static Pattern NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX = Pattern.compile("[\"\'&<>=\\s]"); 176 177 //------------------------------------------------------------------------ 178 // Parsing 179 //------------------------------------------------------------------------ 180 181 /** 182 * Parses a String as HTML. 183 * 184 * @param html String to parse 185 * @return an Html document 186 */ parse(String html)187 public HtmlDocument parse(String html) { 188 this.html = html; 189 // Use a LinkedList because we don't know the number of nodes ahead of 190 // time. This will be compacted into an ArrayList in coalesceTextNodes(). 191 nodes = Lists.newLinkedList(); 192 state = State.IN_TEXT; 193 194 clipped = false; 195 int end = html.length(); 196 int clipEnd = Math.min(clipLength, end); 197 198 for (int i = 0; i < end && !clipped;) { 199 200 // At any one time, the parser is in one of these states: 201 int pos; 202 switch (state) { 203 case IN_TEXT: 204 // text will not attempt to parse beyond the clipping length 205 pos = scanText(i, clipEnd); 206 X.assertTrue(pos > i || state != State.IN_TEXT); // Must make progress. 207 break; 208 209 case IN_TAG: 210 pos = scanTag(i, end); 211 X.assertTrue(pos > i); // Must make progress 212 break; 213 214 case IN_COMMENT: 215 pos = scanComment(i, end); 216 state = State.IN_TEXT; 217 X.assertTrue(pos > i); // Must make progress 218 break; 219 220 case IN_CDATA: 221 pos = scanCDATA(i, end); 222 X.assertTrue(pos > i || state != State.IN_CDATA); // Must make progress 223 break; 224 225 default: 226 throw new Error("Unknown state!"); 227 } 228 229 i = pos; 230 231 // If we've reached or gone beyond the clipping length, stop. 232 clipped = pos >= clipLength; 233 } 234 235 nodes = coalesceTextNodes(nodes); 236 237 HtmlDocument doc = new HtmlDocument(nodes); 238 nodes = null; 239 html = null; 240 return doc; 241 } 242 243 /** 244 * During the course of parsing, we may have multiple adjacent Text nodes, 245 * due to the sanitizer stripping out nodes between Text nodes. It is 246 * important to coalesce them so that later steps in the pipeline can 247 * treat the text as a single block (e.g. the step that inserts <wbr> tags). 248 * @param nodes Original nodes. 249 * @return Nodes with text nodes changed. 250 */ coalesceTextNodes( List<HtmlDocument.Node> nodes)251 static List<HtmlDocument.Node> coalesceTextNodes( 252 List<HtmlDocument.Node> nodes) { 253 List<HtmlDocument.Node> out = 254 new ArrayList<HtmlDocument.Node>(nodes.size()); 255 LinkedList<HtmlDocument.Text> textNodes = Lists.newLinkedList(); 256 257 for (HtmlDocument.Node node : nodes) { 258 if (node instanceof HtmlDocument.Text) { 259 textNodes.add((HtmlDocument.Text) node); 260 } else { 261 mergeTextNodes(textNodes, out); 262 out.add(node); 263 } 264 } 265 mergeTextNodes(textNodes, out); 266 return out; 267 } 268 269 /** 270 * Flushes any Text nodes in {@code textNodes} into a single Text node 271 * in {@code output}. {@code textNodes} is guaranteed to be empty when 272 * the function returns. 273 * @param textNodes Text nodes. 274 * @param output Destination to which results are added. 275 */ mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes, List<HtmlDocument.Node> output)276 private static void mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes, 277 List<HtmlDocument.Node> output) { 278 if (!textNodes.isEmpty()) { 279 if (textNodes.size() == 1) { 280 output.add(textNodes.removeFirst()); 281 } else { 282 int combinedTextLen = 0; 283 int combinedInputLen = 0; 284 for (HtmlDocument.Text text : textNodes) { 285 combinedTextLen += text.getText().length(); 286 if (text.getOriginalHTML() != null) { 287 combinedInputLen += text.getOriginalHTML().length(); 288 } 289 } 290 StringBuilder combinedText = new StringBuilder(combinedTextLen); 291 StringBuilder combinedInput = new StringBuilder(combinedInputLen); 292 while (!textNodes.isEmpty()) { 293 HtmlDocument.Text text = textNodes.removeFirst(); 294 combinedText.append(text.getText()); 295 if (text.getOriginalHTML() != null) { 296 combinedInput.append(text.getOriginalHTML()); 297 } 298 } 299 String originalInput = combinedInputLen > 0 ? combinedInput.toString() : null; 300 output.add(HtmlDocument.createText(combinedText.toString(), originalInput)); 301 } 302 } 303 } 304 305 //------------------------------------------------------------------------ 306 // Text scanning 307 //------------------------------------------------------------------------ 308 /** 309 * A truncated entity is something like <pre>&nbs or a3</pre>. 310 * We only want to find these at the end of a clipped text. 311 */ 312 private static final Pattern TRUNCATED_ENTITY = 313 Pattern.compile("\\& \\#? [0-9a-zA-Z]{0,8} $", Pattern.COMMENTS); 314 315 /** 316 * In a text mode, scan for a tag 317 * @param start Position in original html. 318 * @param end Position in original html. 319 * @return End position of scanned content. 320 */ scanText(final int start, final int end)321 int scanText(final int start, final int end) { 322 int pos; 323 for (pos = start; pos < end; pos++) { 324 char ch = html.charAt(pos); 325 if (ch == '<' && pos + 1 < end) { 326 // Check the next char 327 ch = html.charAt(pos + 1); 328 if (ch == '/' || Character.isLetter(ch) || ch == '!' || ch == '?') { 329 330 // Check if it's an html comment or tag 331 if (html.regionMatches(pos + 1, "!--", 0, 3)) { 332 state = State.IN_COMMENT; 333 } else { 334 state = State.IN_TAG; 335 } 336 break; 337 } 338 } 339 } 340 341 if (pos > start) { 342 int finalPos = pos; 343 String htmlTail = this.html.substring(start, finalPos); 344 345 if ((pos == clipLength) && (clipLength < html.length())) { 346 // We're clipping this HTML, not running off the end. 347 // If we're ending with what looks like a truncated entity, 348 // then clip that part off, too. 349 // If it really was a truncated entity, great. 350 // If it was a false positive, the user won't notice that we clipped 351 // an additional handful of characters. 352 Matcher matcher = TRUNCATED_ENTITY.matcher(htmlTail); 353 if (matcher.find()) { 354 int matchStart = matcher.start(); 355 // The matcher matched in htmlTail, not html. 356 // htmlTail starts at html[start] 357 finalPos = start + matchStart; 358 htmlTail = htmlTail.substring(0, matchStart); 359 } 360 } 361 362 if (finalPos > start) { 363 String originalHtml = null; 364 if (preserveAll) { 365 originalHtml = htmlTail; 366 } else if (preserveValidHtml) { 367 // the only way htmlTail can start with '<' is if it's the last character 368 // in html; otherwise, we would have entered State.IN_TAG or 369 // State.IN_COMMENT above 370 371 // officially a '<' can be valid in a text node, but to be safe we 372 // always escape them 373 originalHtml = CharMatcher.is('<').replaceFrom(htmlTail, "<"); 374 } 375 376 HtmlDocument.Text textnode = HtmlDocument.createEscapedText(htmlTail, originalHtml); 377 nodes.add(textnode); 378 } 379 } 380 return pos; 381 } 382 383 //------------------------------------------------------------------------ 384 // Tag name scanning utility class 385 //------------------------------------------------------------------------ 386 private static class TagNameScanner { 387 private final String html; 388 private String tagName; 389 private int startNamePos = -1; 390 private int endNamePos = -1; 391 TagNameScanner(String html)392 public TagNameScanner(String html) { 393 this.html = html; 394 } 395 396 /** 397 * Scans for a tag name. Sets #startNamePos and #endNamePos. 398 * @param start Position in original html. 399 * @param end Position in original html. 400 * @return End position of scanned content. 401 */ scanName(final int start, final int end)402 public int scanName(final int start, final int end) { 403 int pos; 404 for (pos = start; pos < end; pos++) { 405 char ch = html.charAt(pos); 406 407 // End of tag or end of name. 408 if ((ch == '>') || (ch == '/') || Character.isWhitespace(ch)) { 409 break; 410 } 411 } 412 if (pos > start) { 413 startNamePos = start; 414 endNamePos = pos; 415 } 416 return pos; 417 } 418 419 /** 420 * @return Tag name. 421 */ getTagName()422 public String getTagName() { 423 if (tagName == null && startNamePos != -1 && endNamePos != -1) { 424 tagName = html.substring(startNamePos, endNamePos); 425 } 426 return tagName; 427 } 428 } 429 430 //------------------------------------------------------------------------ 431 // Attribute scanning utility class 432 //------------------------------------------------------------------------ 433 private static class AttributeScanner { 434 private final String html; 435 private String name; 436 private String value; 437 438 // The following have package visibility because they are accessed from 439 // HtmlParser.addAttribute() to handle preservation of original content 440 // around the attribute value, but quoting and escaping of the value itself. 441 int startNamePos = -1; 442 int endNamePos = -1; 443 int startValuePos = -1; 444 int endValuePos = -1; 445 boolean attrValueIsQuoted = false; 446 AttributeScanner(String html)447 public AttributeScanner(String html) { 448 this.html = html; 449 } 450 451 /** 452 * Reset to scan another attribute. 453 */ reset()454 public void reset() { 455 startNamePos = -1; 456 endNamePos = -1; 457 startValuePos = -1; 458 endValuePos = -1; 459 attrValueIsQuoted = false; 460 name = null; 461 value = null; 462 } 463 464 /** 465 * Scans for a tag attribute name. Sets startNamePos and endNamePos. Sets 466 * 'attrName'. 467 * 468 * @param start Position in original html 469 * @param end Position in original html 470 * @return End position of scanned content 471 */ scanName(final int start, final int end)472 int scanName(final int start, final int end) { 473 X.assertTrue(html.charAt(start) != '>'); 474 if (start == end) { 475 // No attribute name 476 return start; 477 } 478 479 int pos; 480 for (pos = start + 1; pos < end; pos++) { 481 char ch = html.charAt(pos); 482 483 // End of tag or end of name. 484 if ((ch == '>') || (ch == '=') || (ch == '/') || Character.isWhitespace(ch)) { 485 break; 486 } 487 } 488 startNamePos = start; 489 endNamePos = pos; 490 return pos; 491 } 492 493 /** 494 * Scans for a tag attribute value. Sets startValuePos_ and endValuePos_. 495 * 496 * @param start Position in original html 497 * @param end Position in original html 498 * @return End position of scanned content 499 */ scanValue(final int start, final int end)500 int scanValue(final int start, final int end) { 501 // Skip whitespace before '='. 502 int pos = skipSpaces(start, end); 503 504 // Handle cases with no attribute value. 505 if ((pos == end) || (html.charAt(pos) != '=')) { 506 // Return start so spaces will be parsed as part of next attribute, 507 // or end of tag. 508 return start; 509 } 510 511 // Skip '=' and whitespace after it. 512 pos++; 513 pos = skipSpaces(pos, end); 514 515 // Handle cases with no attribute value. 516 if (pos == end) { 517 return pos; 518 } 519 520 // Check for quote character ' or " 521 char ch = html.charAt(pos); 522 if (ch == '\'' || ch == '\"') { 523 attrValueIsQuoted = true; 524 pos++; 525 int valueStart = pos; 526 while (pos < end && html.charAt(pos) != ch) { 527 pos++; 528 } 529 startValuePos = valueStart; 530 endValuePos = pos; 531 if (pos < end) { 532 pos++; // Skip the ending quote char 533 } 534 } else { 535 int valueStart = pos; 536 for (; pos < end; pos++) { 537 ch = html.charAt(pos); 538 539 // End of tag or end of value. Not that '/' is included in the value 540 // even if it is the '/>' at the end of the tag. 541 if ((ch == '>') || Character.isWhitespace(ch)) { 542 break; 543 } 544 } 545 startValuePos = valueStart; 546 endValuePos = pos; 547 } 548 549 X.assertTrue(startValuePos > -1); 550 X.assertTrue(endValuePos > -1); 551 X.assertTrue(startValuePos <= endValuePos); 552 X.assertTrue(pos <= end); 553 554 return pos; 555 } 556 557 /** 558 * Skips white spaces. 559 * 560 * @param start Position in original html 561 * @param end Position in original html 562 * @return End position of scanned content 563 */ skipSpaces(final int start, final int end)564 private int skipSpaces(final int start, final int end) { 565 int pos; 566 for (pos = start; pos < end; pos++) { 567 if (!Character.isWhitespace(html.charAt(pos))) { 568 break; 569 } 570 } 571 return pos; 572 } 573 getName()574 public String getName() { 575 if (name == null && startNamePos != -1 && endNamePos != -1) { 576 name = html.substring(startNamePos, endNamePos); 577 } 578 return name; 579 } 580 getValue()581 public String getValue() { 582 if (value == null && startValuePos != -1 && endValuePos != -1) { 583 value = html.substring(startValuePos, endValuePos); 584 } 585 return value; 586 } 587 } 588 589 /** 590 * Holds any unrecognized elements we encounter. Only applicable in 591 * PRESERVE_ALL mode. 592 */ 593 private final HashMap<String,HTML.Element> unknownElements = Maps.newHashMap(); 594 595 /** 596 * Holds any unrecognized attributes we encounter. Only applicable in 597 * PRESERVE_ALL mode. 598 */ 599 private final HashMap<String,HTML.Attribute> unknownAttributes = Maps.newHashMap(); 600 601 /** 602 * @param name Element name. 603 * @return "Dummy" element. Not useful for any real HTML processing, but 604 * gives us a placeholder for tracking original HTML contents. 605 */ lookupUnknownElement(String name)606 private HTML.Element lookupUnknownElement(String name) { 607 name = name.toLowerCase(); 608 HTML.Element result = unknownElements.get(name); 609 if (result == null) { 610 result = new HTML.Element(name, 611 HTML.Element.NO_TYPE, 612 /* empty */ false, 613 /* optional end tag */ true, 614 /* breaks flow*/ false, 615 HTML.Element.Flow.NONE); 616 unknownElements.put(name, result); 617 } 618 return result; 619 } 620 621 /** 622 * @param name Attribute name. 623 * @return "Dummy" attribute. Not useful for any real HTML processing, but 624 * gives us a placeholder for tracking original HTML contents. 625 */ lookupUnknownAttribute(String name)626 private HTML.Attribute lookupUnknownAttribute(String name) { 627 name = name.toLowerCase(); 628 HTML.Attribute result = unknownAttributes.get(name); 629 if (result == null) { 630 result = new HTML.Attribute(name, HTML.Attribute.NO_TYPE); 631 unknownAttributes.put(name, result); 632 } 633 return result; 634 } 635 636 /** 637 * Scans for an HTML tag. 638 * 639 * @param start Position in original html. 640 * @param end Position in original html. 641 * @return End position of scanned content. 642 */ scanTag(final int start, final int end)643 int scanTag(final int start, final int end) { 644 X.assertTrue(html.charAt(start) == '<'); 645 646 // nameStart is where we begin scanning for the tag name and attributes, 647 // so we skip '<'. 648 int nameStart = start + 1; 649 650 // Next state is Text, except the case when we see a STYLE/SCRIPT tag. See 651 // code below. 652 state = State.IN_TEXT; 653 654 // End tag? 655 boolean isEndTag = false; 656 if (html.charAt(nameStart) == '/') { 657 isEndTag = true; 658 ++nameStart; 659 } 660 661 // Tag name and element 662 TagNameScanner tagNameScanner = new TagNameScanner(html); 663 int pos = tagNameScanner.scanName(nameStart, end); 664 String tagName = tagNameScanner.getTagName(); 665 HTML.Element element = null; 666 if (tagName == null) { 667 // For some reason, browsers treat start and end tags differently 668 // when they don't have a valid tag name - end tags are swallowed 669 // (e.g., "</ >"), start tags treated as text (e.g., "< >") 670 if (!isEndTag) { 671 // This is not really a tag, treat the '<' as text. 672 HtmlDocument.Text text = HtmlDocument.createText("<", preserveAll ? "<" : null); 673 nodes.add(text); 674 state = State.IN_TEXT; 675 return nameStart; 676 } 677 678 if (preserveAll) { 679 element = lookupUnknownElement(""); 680 } 681 } else { 682 element = lookupElement(tagName); 683 if (element == null) { 684 if (DEBUG) { 685 // Unknown element 686 debug("Unknown element: " + tagName); 687 } 688 if (preserveAll) { 689 element = lookupUnknownElement(tagName); 690 } 691 } 692 } 693 694 // Attributes 695 boolean isSingleTag = false; 696 ArrayList<HtmlDocument.TagAttribute> attributes = null; 697 int allAttributesStartPos = pos; 698 int nextAttributeStartPos = pos; 699 AttributeScanner attributeScanner = new AttributeScanner(html); 700 while (pos < end) { 701 int startPos = pos; 702 char ch = html.charAt(pos); 703 704 // Are we at the end of the tag? 705 if ((pos + 1 < end) && (ch == '/') && (html.charAt(pos + 1) == '>')) { 706 isSingleTag = true; 707 ++pos; 708 break; // Done 709 } 710 if (ch == '>') { 711 break; // Done 712 } 713 714 // See bug 870742 (Buganizer). 715 if (isEndTag && ('<' == ch)) { 716 // '<' not allowed in end tag, so we finish processing this tag and 717 // return to State.IN_TEXT. We mimic Safari & Firefox, which both 718 // terminate the tag when it contains a '<'. 719 if (element != null) { 720 addEndTag(element, start, allAttributesStartPos, pos); 721 } 722 state = State.IN_TEXT; 723 return pos; 724 } 725 726 if (Character.isWhitespace(ch)) { 727 // White space, skip it. 728 ++pos; 729 } else { 730 // Scan for attribute 731 attributeScanner.reset(); 732 pos = attributeScanner.scanName(pos, end); 733 X.assertTrue(pos > startPos); 734 735 // If it's a valid attribute, scan attribute values 736 if (attributeScanner.getName() != null) { 737 pos = attributeScanner.scanValue(pos, end); 738 739 // Add the attribute to the list 740 if (element != null) { 741 if (attributes == null) { 742 attributes = new ArrayList<HtmlDocument.TagAttribute>(); 743 } 744 addAttribute(attributes, attributeScanner, nextAttributeStartPos, pos); 745 } 746 nextAttributeStartPos = pos; 747 } 748 } 749 750 // Make sure that we make progress! 751 X.assertTrue(pos > startPos); 752 } 753 754 // Cannot find the close tag, so we treat this as text 755 if (pos == end) { 756 X.assertTrue(start < end); 757 String textNodeContent = html.substring(start, end); 758 String originalContent = null; 759 if (preserveAll) { 760 originalContent = textNodeContent; 761 } else if (preserveValidHtml) { 762 // Officially a '<' can be valid in a text node, but to be safe we 763 // always escape them. 764 originalContent = 765 CharMatcher.is('<').replaceFrom(html.substring(start, end), "<"); 766 } 767 nodes.add(HtmlDocument.createEscapedText(textNodeContent, originalContent)); 768 return end; 769 } 770 771 // Skip '>' 772 X.assertTrue(html.charAt(pos) == '>'); 773 pos++; 774 775 // Check if it's an element we're keeping (either an HTML4 element, or an 776 // unknown element we're preserving). If not, ignore the tag. 777 if (element != null) { 778 if (isEndTag) { 779 addEndTag(element, start, allAttributesStartPos, pos); 780 } else { 781 // Special case: if it's a STYLE/SCRIPT element, we go to into 782 // CDATA state. 783 if (HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element)) { 784 state = State.IN_CDATA; 785 } 786 787 addStartTag(element, start, allAttributesStartPos, 788 nextAttributeStartPos, 789 pos, isSingleTag, attributes); 790 } 791 } 792 793 return pos; 794 } 795 796 /** 797 * Lookups the element in our whitelist(s). Whitelists are consulted in 798 * reverse chronological order (starting from the most recently added 799 * whitelist), allowing clients to override the default behavior. 800 * 801 * @param name Element name. 802 * @return Element. 803 */ 804 HTML.Element lookupElement(String name) { 805 ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size()); 806 while (iter.hasPrevious()) { 807 HTML.Element elem = iter.previous().lookupElement(name); 808 if (elem != null) { 809 return elem; 810 } 811 } 812 return null; 813 } 814 815 /** 816 * Lookups the attribute in our whitelist(s). Whitelists are consulted in 817 * reverse chronological order (starting from the most recently added 818 * whitelist), allowing clients to override the default behavior. 819 * 820 * @param name Attribute name. 821 * @return Attribute. 822 */ 823 HTML.Attribute lookupAttribute(String name) { 824 ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size()); 825 while (iter.hasPrevious()) { 826 HTML.Attribute attr = iter.previous().lookupAttribute(name); 827 if (attr != null) { 828 return attr; 829 } 830 } 831 return null; 832 } 833 834 /** 835 * @param element Tag element 836 * @param startPos Start of tag, including '<' 837 * @param startAttributesPos Start of attributes. This is the first 838 * character after the tag name. If there are no attributes, this is the end 839 * of the tag. 840 * @param endAttributesPos First position after last attribute 841 * @param endPos End of tag, including '>' character 842 * @param isSingleTag True iff this is a self-terminating tag 843 * @param attributes Tag attributes 844 */ 845 private void addStartTag(HTML.Element element, final int startPos, 846 final int startAttributesPos, final int endAttributesPos, 847 final int endPos, final boolean isSingleTag, 848 ArrayList<HtmlDocument.TagAttribute> attributes) { 849 X.assertTrue(startPos < startAttributesPos); 850 X.assertTrue(startAttributesPos <= endAttributesPos); 851 X.assertTrue(endAttributesPos <= endPos); 852 853 if (preserveAll) { 854 String beforeAttrs = html.substring(startPos, startAttributesPos); 855 String afterAttrs = html.substring(endAttributesPos, endPos); 856 HtmlDocument.Tag tag = (isSingleTag) 857 ? HtmlDocument.createSelfTerminatingTag(element, attributes, 858 beforeAttrs, afterAttrs) 859 : HtmlDocument.createTag(element, attributes, 860 beforeAttrs, afterAttrs); 861 nodes.add(tag); 862 } else if (preserveValidHtml) { 863 // This is the beginning of the tag up through the tag name. It should not 864 // be possible for this to contain characters needing escaping, but we add 865 // this redundant check to avoid an XSS attack that might get past our 866 // parser but trick a browser into executing a script. 867 X.assertTrue(html.charAt(startPos) == '<'); 868 StringBuilder beforeAttrs = new StringBuilder("<"); 869 String tagName = html.substring(startPos + 1, startAttributesPos); 870 beforeAttrs.append(CharEscapers.asciiHtmlEscaper().escape(tagName)); 871 872 // Verify end-of-tag characters 873 int endContentPos = endPos - 1; 874 X.assertTrue(html.charAt(endContentPos) == '>'); 875 if (isSingleTag) { 876 --endContentPos; 877 X.assertTrue(html.charAt(endContentPos) == '/'); 878 } 879 X.assertTrue(endAttributesPos <= endContentPos); 880 881 // This is any extra characters between the last attribute and the end of 882 // the tag. 883 X.assertTrue(endAttributesPos < endPos); 884 String afterAttrs = html.substring(endAttributesPos, endPos); 885 886 // Strip all but preceding whitespace. 887 HtmlDocument.Tag tag = (isSingleTag) 888 ? HtmlDocument.createSelfTerminatingTag(element, attributes, 889 beforeAttrs.toString(), afterAttrs) 890 : HtmlDocument.createTag(element, attributes, 891 beforeAttrs.toString(), afterAttrs); 892 nodes.add(tag); 893 } else { 894 // Normalize. 895 HtmlDocument.Tag tag = (isSingleTag) 896 ? HtmlDocument.createSelfTerminatingTag(element, attributes) 897 : HtmlDocument.createTag(element, attributes); 898 nodes.add(tag); 899 } 900 } 901 902 /** 903 * @param element End tag element. 904 * @param startPos Start of tag, including '<'. 905 * @param startAttributesPos Start of attributes. This is the first 906 * character after the tag name. If there are no attributes, this is the end 907 * of the tag. 908 * @param endPos End of tag. This usually contains the '>' character, but in 909 * the case where browsers force a termination of a malformed tag, it doesn't. 910 */ 911 private void addEndTag(HTML.Element element, final int startPos, 912 final int startAttributesPos, final int endPos) { 913 X.assertTrue(element != null); 914 X.assertTrue(html.charAt(startPos) == '<'); 915 X.assertTrue(html.charAt(startPos + 1) == '/'); 916 917 if (preserveAll) { 918 // Preserve all: keep actual content even if it's malformed. 919 X.assertTrue(startPos < endPos); 920 String content = html.substring(startPos, endPos); 921 nodes.add(HtmlDocument.createEndTag(element, content)); 922 } else if (preserveValidHtml) { 923 // Preserve valid: terminate the tag. 924 925 StringBuilder validContent = new StringBuilder("</"); 926 927 // This is the beginning of the tag up through the tag name. It should not 928 // be possible for this to contain characters needing escaping, but we add 929 // this redundant check to avoid an XSS attack that might get past our 930 // parser but trick a browser into executing a script. 931 X.assertTrue(startPos < startAttributesPos); 932 String tagName = html.substring(startPos + 2, startAttributesPos); 933 validContent.append(CharEscapers.asciiHtmlEscaper().escape(tagName)); 934 935 // This is the rest of the tag, including any attributes. 936 // See bug 874396 (Buganizer). We don't allow attributes in an end tag. 937 X.assertTrue(startAttributesPos <= endPos); 938 String endOfTag = html.substring(startAttributesPos, endPos); 939 if (endOfTag.charAt(endOfTag.length() - 1) != '>') { 940 endOfTag += '>'; 941 } 942 943 // Strip everything but leading whitespace. 944 validContent.append(endOfTag.replaceAll("\\S+.*>", ">")); 945 946 nodes.add(HtmlDocument.createEndTag(element, validContent.toString())); 947 } else { 948 // Normalize: ignore the original content. 949 nodes.add(HtmlDocument.createEndTag(element)); 950 } 951 } 952 953 /** 954 * Creates and adds an attribute to the list. 955 * 956 * @param attributes Destination of new attribute. 957 * @param scanner Scanned attribute. 958 * @param startPos start position (inclusive) in original HTML of this 959 * attribute, including preceeding separator characters (generally this 960 * is whitespace, but it might contain other characters). This is the 961 * end position of the tag name or previous attribute +1. 962 * @param endPos end position (exclusive) in original HTML of this attribute. 963 */ 964 private void addAttribute(ArrayList<HtmlDocument.TagAttribute> attributes, 965 AttributeScanner scanner, final int startPos, final int endPos) { 966 X.assertTrue(startPos < endPos); 967 968 String name = scanner.getName(); 969 X.assertTrue(name != null); 970 HTML.Attribute htmlAttribute = lookupAttribute(name); 971 972 // This can be null when there's no value, e.g., input.checked attribute. 973 String value = scanner.getValue(); 974 975 if (htmlAttribute == null) { 976 // Unknown attribute. 977 if (DEBUG) { 978 debug("Unknown attribute: " + name); 979 } 980 if (preserveAll) { 981 String original = html.substring(startPos, endPos); 982 attributes.add(HtmlDocument.createTagAttribute( 983 lookupUnknownAttribute(name), value, original)); 984 } 985 } else { 986 String unescapedValue = (value == null) ? null : StringUtil.unescapeHTML(value); 987 if (preserveAll) { 988 attributes.add(HtmlDocument.createTagAttribute(htmlAttribute, 989 unescapedValue, html.substring(startPos, endPos))); 990 } else if (preserveValidHtml) { 991 StringBuilder original = new StringBuilder(); 992 993 // This includes any separator characters between the tag name or 994 // preceding attribute and this one. 995 // This addresses bugs 870757 and 875303 (Buganizer). 996 // Don't allow non-whitespace separators between attributes. 997 X.assertTrue(startPos <= scanner.startNamePos); 998 String originalPrefix = html.substring( 999 startPos, scanner.startNamePos).replaceAll("\\S+", ""); 1000 if (originalPrefix.length() == 0) { 1001 originalPrefix = " "; 1002 } 1003 original.append(originalPrefix); 1004 1005 if (value == null) { 1006 // This includes the name and any following whitespace. Escape in case 1007 // the name has any quotes or '<' that could confuse a browser. 1008 X.assertTrue(scanner.startNamePos < endPos); 1009 String nameEtc = html.substring(scanner.startNamePos, endPos); 1010 original.append(CharEscapers.asciiHtmlEscaper().escape(nameEtc)); 1011 } else { 1012 // Escape name in case the name has any quotes or '<' that could 1013 // confuse a browser. 1014 original.append(CharEscapers.asciiHtmlEscaper().escape(name)); 1015 1016 // This includes the equal sign, and any other whitespace 1017 // between the name and value. It also contains the opening quote 1018 // character if there is one. 1019 X.assertTrue(scanner.endNamePos < scanner.startValuePos); 1020 original.append(html.substring(scanner.endNamePos, scanner.startValuePos)); 1021 1022 // This is the value, excluding any quotes. 1023 if (scanner.attrValueIsQuoted) { 1024 // Officially a '<' can be valid in an attribute value, but to be 1025 // safe we always escape them. 1026 original.append(value.replaceAll("<", "<")); 1027 } else { 1028 // This addresses bug 881426 (Buganizer). Put quotes around any 1029 // dangerous characters, which is what most of the browsers do. 1030 if (NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX.matcher(value).find()) { 1031 original.append('"'); 1032 original.append(value.replaceAll("\"", """)); 1033 original.append('"'); 1034 } else { 1035 original.append(value); 1036 } 1037 } 1038 1039 // This includes end quote, if applicable. 1040 X.assertTrue(scanner.endValuePos <= endPos); 1041 original.append(html.substring(scanner.endValuePos, endPos)); 1042 } 1043 1044 attributes.add(HtmlDocument.createTagAttribute( 1045 htmlAttribute, unescapedValue, original.toString())); 1046 } else { 1047 attributes.add(HtmlDocument.createTagAttribute( 1048 htmlAttribute, unescapedValue)); 1049 } 1050 } 1051 } 1052 1053 //------------------------------------------------------------------------ 1054 // Comment scanning 1055 //------------------------------------------------------------------------ 1056 private static final String START_COMMENT = "<!--"; 1057 private static final String END_COMMENT = "-->"; 1058 1059 private int scanComment(final int start, final int end) { 1060 1061 X.assertTrue(html.regionMatches(start, START_COMMENT, 0, START_COMMENT.length())); 1062 1063 // Scan for end of comment 1064 int pos = html.indexOf(END_COMMENT, start + START_COMMENT.length()); 1065 if (pos != -1) { 1066 pos += END_COMMENT.length(); 1067 } else { 1068 // Look for '>'. If we can't find that, the rest of the text is comments. 1069 pos = html.indexOf('>', start + 4); 1070 if (pos != -1) { 1071 ++pos; 1072 } else { 1073 pos = end; 1074 } 1075 } 1076 1077 if (preserveAll) { 1078 nodes.add(HtmlDocument.createHtmlComment(html.substring(start, pos))); 1079 } 1080 1081 return pos; 1082 } 1083 1084 //------------------------------------------------------------------------ 1085 // CDATA scanning 1086 //------------------------------------------------------------------------ 1087 int scanCDATA(final int start, final int end) { 1088 1089 // Get the tag: must be either STYLE or SCRIPT 1090 HtmlDocument.Tag tag = (HtmlDocument.Tag) nodes.get(nodes.size() - 1); 1091 HTML.Element element = tag.getElement(); 1092 X.assertTrue(HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element)); 1093 1094 int pos; 1095 for (pos = start; pos < end; pos++) { 1096 if (pos + 2 < end && 1097 html.charAt(pos) == '<' && 1098 html.charAt(pos + 1) == '/' && 1099 html.regionMatches(true, pos + 2, element.getName(), 0, 1100 element.getName().length())) { 1101 break; 1102 } 1103 } 1104 1105 // Add a CDATA node 1106 if (pos > start) { 1107 HtmlDocument.CDATA cdata = 1108 HtmlDocument.createCDATA(html.substring(start, pos)); 1109 nodes.add(cdata); 1110 } 1111 1112 state = State.IN_TAG; 1113 return pos; 1114 } 1115 1116 //------------------------------------------------------------------------ 1117 public static void main(String[] args) throws IOException { 1118 1119 DEBUG = true; 1120 1121 String html = new String(ByteStreams.toByteArray(System.in), "ISO-8859-1"); 1122 1123 HtmlParser parser = new HtmlParser(); 1124 HtmlDocument doc = parser.parse(html); 1125 System.out.println(doc.toString()); 1126 } 1127 1128 private static void debug(String str) { 1129 System.err.println(str); 1130 } 1131 }