1 /** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.android.mail.common.html.parser; 17 18 import android.text.Spanned; 19 20 import com.google.android.mail.common.base.CharMatcher; 21 import com.google.android.mail.common.base.Preconditions; 22 import com.google.android.mail.common.base.X; 23 import com.google.common.annotations.VisibleForTesting; 24 import com.google.common.collect.ImmutableSet; 25 26 import java.util.ArrayList; 27 import java.util.Arrays; 28 import java.util.Collections; 29 import java.util.List; 30 import java.util.Set; 31 import java.util.Stack; 32 import java.util.logging.Logger; 33 34 /** 35 * HtmlTree represents a parsed and well-formed html text, it provides 36 * methods to convert to plain text. It also provides methods to find 37 * well-formed blocks of text, for quote detection. 38 * 39 * We don't really build a html tree data structure. Instead, for 40 * efficiency, and for the ability to do a simple in-order-traversal 41 * of the tree, we simply keeps a linear list of nodes (in-order). 42 * The begin_ and end_ arrays keeps track of the starting end ending 43 * nodes: 44 * 45 * For a string node, begin_[node] = end_[node] = node 46 * For an open tag, begin_[node] = node, end_[node] = the matching end tag 47 * For a close tag, end_[node] = the matching open tag, end_[node] = node 48 * 49 * @author jlim@google.com (Jing Yee Lim) 50 */ 51 public class HtmlTree { 52 // http://www.w3.org/TR/html4/struct/text.html#h-9.1 53 private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n"); 54 55 /** 56 * An interface that allows clients to provide their own implementation 57 * of a {@link Converter}. 58 */ 59 public static interface ConverterFactory { 60 /** 61 * Creates a new instance of a {@link Converter} to convert 62 * the contents of an {@link HtmlTree} to some resulting object. 63 */ createInstance()64 Converter createInstance(); 65 } 66 67 /** 68 * An interface for an object which converts a single HtmlTree into some object 69 */ 70 public static interface Converter<T> { 71 /** 72 * Adds the given node {@code n} to plain text. 73 * 74 * @param n The node to convert to text. 75 * @param nodeNum The number of the node among the list of all notes. 76 * @param endNum The number of the ending node if this is a start node, 77 * otherwise the same as {@code nodeNum}. 78 */ addNode(HtmlDocument.Node n, int nodeNum, int endNum)79 void addNode(HtmlDocument.Node n, int nodeNum, int endNum); 80 81 /** 82 * Returns the current length of the plain text. 83 */ getPlainTextLength()84 int getPlainTextLength(); 85 86 /** 87 * Returns the current built object. 88 */ getObject()89 T getObject(); 90 } 91 92 /** A factory that produces converters of the default implementation. */ 93 private static final ConverterFactory DEFAULT_CONVERTER_FACTORY = 94 new ConverterFactory() { 95 @Override 96 public Converter<String> createInstance() { 97 return new DefaultPlainTextConverter(); 98 } 99 }; 100 101 /** Contains html nodes */ 102 private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>(); 103 104 /** Keeps track of beginning and end of each node */ 105 private final Stack<Integer> begins = new Stack<Integer>(); 106 private final Stack<Integer> ends = new Stack<Integer>(); 107 108 /** Plain text (lazy creation) */ 109 private String plainText; 110 111 /** Constructed span (lazy creation) */ 112 private Spanned constructedSpan; 113 114 /** The html string (lazy creation) */ 115 private String html; 116 117 /** textPositions[node pos] = the text position */ 118 private int[] textPositions; 119 120 private ConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY; 121 122 // For debugging only 123 private static final boolean DEBUG = false; 124 125 private static final Logger logger = Logger.getLogger(HtmlTree.class.getName()); 126 127 //------------------------------------------------------------------------ 128 129 /** HtmlTree can only be constructed from this package */ HtmlTree()130 HtmlTree() { 131 } 132 133 /** 134 * Sets a new {@link ConverterFactory} to be used to convert 135 * the contents of this tree to plaintext. 136 */ setConverterFactory(ConverterFactory factory)137 public void setConverterFactory(ConverterFactory factory) { 138 if (factory == null) { 139 throw new NullPointerException("factory must not be null"); 140 } 141 converterFactory = factory; 142 } 143 144 /** 145 * Gets the list of node objects. A node can be either a 146 * Tag, EngTag or a String object. 147 * @return the nodes of the tree 148 */ getNodesList()149 public List<HtmlDocument.Node> getNodesList() { 150 return Collections.unmodifiableList(nodes); 151 } 152 153 /** 154 * @return number of nodes 155 */ getNumNodes()156 public int getNumNodes() { 157 return nodes.size(); 158 } 159 160 /** 161 * Returns number of matching open tag node, or {@code endTagNodeNum} itself 162 * if it does not point to a closing tag. 163 */ findOpenTag(int endTagNodeNum)164 public int findOpenTag(int endTagNodeNum) { 165 X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size()); 166 return begins.get(endTagNodeNum); 167 } 168 169 /** 170 * Returns number of matching closing tag node, or {@code openTagNodeNum} itself 171 * if it does not point to an open tag or points to an open tag with no closing one. 172 */ findEndTag(int openTagNodeNum)173 public int findEndTag(int openTagNodeNum) { 174 X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size()); 175 return ends.get(openTagNodeNum); 176 } 177 178 /** 179 * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself 180 * if it does not point to an open/closing tag (e.g text node or comment). 181 */ findPairedTag(int tagNodeNum)182 public int findPairedTag(int tagNodeNum) { 183 X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size()); 184 int openNodeNum = begins.get(tagNodeNum); 185 int endNodeNum = ends.get(tagNodeNum); 186 return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum; 187 } 188 189 /** 190 * Gets the entire html. 191 */ getHtml()192 public String getHtml() { 193 return getHtml(-1); 194 } 195 196 /** 197 * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the 198 * specified size. 199 */ getHtml(int wrapSize)200 public String getHtml(int wrapSize) { 201 if (html == null) { 202 html = getHtml(0, nodes.size(), wrapSize); 203 } 204 return html; 205 } 206 207 /** Gets parts of the html */ getHtml(int fromNode, int toNode)208 public String getHtml(int fromNode, int toNode) { 209 return getHtml(fromNode, toNode, -1); 210 } 211 212 /** 213 * Gets parts of the html, if wrapSize is > 0, it tries 214 * to do wrapping at the specified size. 215 */ getHtml(int fromNode, int toNode, int wrapSize)216 public String getHtml(int fromNode, int toNode, int wrapSize) { 217 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 218 219 int estSize = (toNode - fromNode) * 10; 220 StringBuilder sb = new StringBuilder(estSize); 221 int lastWrapIndex = 0; // used for wrapping 222 for (int n = fromNode; n < toNode; n++) { 223 HtmlDocument.Node node = nodes.get(n); 224 node.toHTML(sb); 225 // TODO: maybe we can be smarter about this and not add newlines 226 // within <pre> tags, unless the whole long line is encompassed 227 // by the <pre> tag. 228 if (wrapSize > 0) { 229 // We can only wrap if the last outputted node is an element that 230 // breaks the flow. Otherwise, we risk the possibility of inserting 231 // spaces where they shouldn't be. 232 if ((node instanceof HtmlDocument.Tag && 233 ((HtmlDocument.Tag) node).getElement().breaksFlow()) || 234 (node instanceof HtmlDocument.EndTag && 235 ((HtmlDocument.EndTag) node).getElement().breaksFlow())) { 236 // Check to see if there is a newline in the most recent node's html. 237 int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n'); 238 if (recentNewLine != -1) { 239 lastWrapIndex += recentNewLine; 240 } 241 // If the last index - last index of a newline is greater than 242 // wrapSize, add a newline. 243 if (((sb.length() - 1) - lastWrapIndex) > wrapSize) { 244 sb.append('\n'); 245 lastWrapIndex = sb.length() - 1; 246 } 247 } 248 } 249 } 250 251 return sb.toString(); 252 } 253 254 /** 255 * Convert a html region into chunks of html code, each containing 256 * roughly chunkSize characters. 257 */ getHtmlChunks(int fromNode, int toNode, int chunkSize)258 public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) { 259 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 260 261 ArrayList<String> chunks = new ArrayList<String>(); 262 263 // Do a best effort attempt to not split apart certain elements (as of now, 264 // just the <textarea>). We cannot guarantee that they will not be split 265 // because the client may specify endpoint nodes that land in the middle 266 // of an element (although this shouldn't happen if the endpoints returned 267 // by createBlocks() are properly used). 268 int stack = 0; 269 boolean balanced = true; 270 271 StringBuilder sb = new StringBuilder(chunkSize + 256); 272 for (int n = fromNode; n < toNode; n++) { 273 HtmlDocument.Node node = nodes.get(n); 274 node.toHTML(sb); 275 276 if (node instanceof HtmlDocument.Tag) { 277 if (HTML4.TEXTAREA_ELEMENT.equals( 278 ((HtmlDocument.Tag) node).getElement())) { 279 stack++; 280 } 281 } 282 if (node instanceof HtmlDocument.EndTag) { 283 if (HTML4.TEXTAREA_ELEMENT.equals( 284 ((HtmlDocument.EndTag) node).getElement())) { 285 if (stack == 0) { 286 balanced = false; 287 } else { 288 stack--; 289 } 290 } 291 } 292 293 if (stack == 0 && sb.length() >= chunkSize) { 294 chunks.add(sb.toString()); 295 sb.setLength(0); 296 } 297 } 298 299 // Don't forget the last chunk! 300 if (sb.length() > 0) { 301 chunks.add(sb.toString()); 302 } 303 304 // If the tree is not balanced (cut off in the middle of a node), log 305 // debug data. Clients should fix their code so that the endpoints from 306 // createBlocks() are properly used. 307 if (!balanced || stack != 0) { 308 StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n"); 309 debug.append(getHtml()); 310 debug.append("\nfromNode: ").append(fromNode); 311 debug.append("\ntoNode: ").append(toNode); 312 debug.append("\nNum nodes_: ").append(getNumNodes()); 313 for (String chunk : chunks) { 314 debug.append("\nChunk:\n").append(chunk); 315 } 316 logger.severe(debug.toString()); 317 } 318 319 return chunks; 320 } 321 322 /** 323 * Returns height (maximum length from root to a leaf) of the HTML tree. 324 * @return height of the HTML tree. 325 */ getTreeHeight()326 public int getTreeHeight() { 327 int currentHeight = 0; 328 int maxHeight = 0; 329 330 for (int i = 0; i < nodes.size(); i++) { 331 HtmlDocument.Node node = nodes.get(i); 332 if (node instanceof HtmlDocument.Tag) { 333 currentHeight++; 334 if (currentHeight > maxHeight) { 335 maxHeight = currentHeight; 336 } 337 if (((HtmlDocument.Tag) node).getElement().isEmpty()) { 338 // Empty tags have no closing pair, so decrease counter here. 339 currentHeight--; 340 } 341 } else if (node instanceof HtmlDocument.EndTag) { 342 currentHeight--; 343 } 344 } 345 346 // TODO(anatol): make this value cachable? 347 return maxHeight; 348 } 349 350 //------------------------------------------------------------------------ 351 // Creating well-formed blocks within the html tree. 352 //------------------------------------------------------------------------ 353 /** 354 * A Block represents a region of a html tree that 355 * 1) is well-formed, i.e. for each node in the block, all its descendants 356 * are also contained in the block. So it's safe to wrap the region 357 * within a <table> or <div>, etc. 358 * 2) starts at the beginning of a "line", e.g. a <div>, a <br>. 359 */ 360 public static class Block { 361 /* The starting node */ 362 public int start_node; 363 364 /* The ending node (non-inclusive to the block) */ 365 public int end_node; 366 } 367 368 /** 369 * Creates a list of Blocks, given a text-range. 370 * We may create multiple blocks if one single well-formed Block cannot be 371 * created. 372 * 373 * @param textStart beginning plain-text offset 374 * @param textEnd beginning plain-text offset 375 * @param minNode the smallest node number 376 * @param maxNode the largest node number 377 * @return a list of 0 or more Block objects, never null 378 */ createBlocks(int textStart, int textEnd, int minNode, int maxNode)379 public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) { 380 381 ArrayList<Block> blocks = new ArrayList<Block>(); 382 int startNode = Math.max(getBlockStart(textStart), minNode); 383 int endNode = Math.min(getBlockEnd(textEnd), maxNode); 384 385 if (DEBUG) { 386 debug("Creating block: " + 387 "text pos: " + textStart + "-" + textEnd + "\n" + 388 "node pos: " + startNode + "-" + endNode + "\n" + 389 plainText.substring(textStart, textEnd)); 390 } 391 392 // Split up the block [start, end) into one or more blocks that 393 // are well-formed, and begins at a "line" boundary. 394 int blockStart = -1; 395 for (int n = startNode; n < endNode;) { 396 397 // The node n spans [nBegin, nEnd] 398 int nBegin = begins.get(n); 399 int nEnd = ends.get(n); 400 401 if (blockStart == -1) { 402 // Check if this is a valid start node 403 if (nBegin >= n && nEnd <= endNode && 404 canBeginBlockAt(n)) { 405 blockStart = n; 406 n = nEnd + 1; 407 } else { 408 n++; 409 } 410 continue; 411 } 412 413 // If the node [nBegin, nEnd) lies completely within 414 // the region then proceed to the (nEnd + 1). 415 if (nBegin >= blockStart && nEnd < endNode) { 416 n = nEnd + 1; 417 continue; 418 } 419 420 // If we got here, we have to break up the region into one 421 // or more blocks because the current node cannot be included 422 // in the region. 423 if (DEBUG) { 424 debug("Forcing new block: " + n + " (" + nBegin + " " + nEnd + 425 ") exceeds (" + blockStart + " " + endNode + ")"); 426 } 427 Block b = new Block(); 428 b.start_node = blockStart; 429 b.end_node = n; 430 blocks.add(b); 431 432 blockStart = -1; 433 n++; 434 } 435 436 // Last block 437 if (blockStart != -1) { 438 Block b = new Block(); 439 b.start_node = blockStart; 440 b.end_node = endNode; 441 blocks.add(b); 442 } 443 444 if (DEBUG) { 445 for (int i = 0; i < blocks.size(); i++) { 446 Block b = blocks.get(i); 447 debug("Block " + i + "/" + blocks.size() + ": " + 448 b.start_node + "-" + b.end_node + " " + 449 getPlainText(b.start_node, b.end_node)); 450 } 451 } 452 453 return blocks; 454 } 455 456 /** 457 * Checks if a block can begin starting from a node position 458 */ canBeginBlockAt(int nodePos)459 private boolean canBeginBlockAt(int nodePos) { 460 int textPos = textPositions[nodePos]; 461 462 // Make sure that we don't exceed the text position, this happens 463 // for the last tag nodes. 464 if (textPos == plainText.length()) { 465 textPos--; 466 } 467 468 // Scan backwards to check if a nodePos is at the beginning 469 // of a line. 470 for (int i = textPos; i > 0; i--) { 471 char ch = plainText.charAt(i); 472 if (ch == '\n') { 473 return true; 474 } 475 if (i < textPos && !HTML_WHITESPACE.matches(ch)) { 476 return false; 477 } 478 } 479 return true; 480 } 481 482 /** 483 * Returns the start of a block given a text-pos 484 */ getBlockStart(int textPos)485 private int getBlockStart(int textPos) { 486 int nodenum = Arrays.binarySearch(textPositions, textPos); 487 if (nodenum >= 0) { 488 // Got an exact node alignment. Get the outer most pos that 489 // matches the text position 490 while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) { 491 nodenum--; 492 } 493 } else { 494 // textPos matches the middle of a node. 495 nodenum = -nodenum - 1; 496 } 497 498 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 499 return nodenum; 500 } 501 502 /** 503 * Returns the end of a block given a text-pos 504 */ getBlockEnd(int textPos)505 private int getBlockEnd(int textPos) { 506 int nodenum = Arrays.binarySearch(textPositions, textPos); 507 if (nodenum >= 0) { 508 // Got an exact node alignment. 509 while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) { 510 nodenum++; 511 } 512 } else { 513 // textPos matches the middle of a node. 514 nodenum = -nodenum - 2; 515 } 516 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 517 return nodenum; 518 } 519 520 //------------------------------------------------------------------------ 521 // Plain text view of the html tree 522 //------------------------------------------------------------------------ 523 /** 524 * @return the plain-text position corresponding to the node 525 */ getTextPosition(int node)526 public int getTextPosition(int node) { 527 return textPositions[node]; 528 } 529 530 /** 531 * @return a plain-text String of the html tree 532 */ getPlainText()533 public String getPlainText() { 534 if (plainText == null) { 535 convertToPlainText(); 536 } 537 return plainText; 538 } 539 540 /** 541 * @return a plain-text String of a part of the html tree 542 */ getPlainText(int fromNode, int toNode)543 public String getPlainText(int fromNode, int toNode) { 544 if (plainText == null) { 545 convertToPlainText(); 546 } 547 int textstart = textPositions[fromNode]; 548 int textend = textPositions[toNode]; 549 return plainText.substring(textstart, textend); 550 } 551 552 /** 553 * Converts the html tree to plain text. 554 * We simply iterate through the nodes in the tree. 555 * As we output the plain-text, we keep track of the text position 556 * of each node. 557 * For String nodes, we replace '\n' with ' ' unless we're in a 558 * <pre> block. 559 */ convertToPlainText()560 private void convertToPlainText() { 561 X.assertTrue(plainText == null && textPositions == null); 562 563 int numNodes = nodes.size(); 564 565 // Keeps track of start text position of each node, including a last 566 // entry for the size of the text. 567 textPositions = new int[numNodes + 1]; 568 569 Converter<String> converter = (Converter<String>) converterFactory.createInstance(); 570 571 for (int i = 0; i < numNodes; i++) { 572 textPositions[i] = converter.getPlainTextLength(); 573 converter.addNode(nodes.get(i), i, ends.get(i)); 574 } 575 576 // Add a last entry, so that textPositions_[nodes_.size()] is valid. 577 textPositions[numNodes] = converter.getPlainTextLength(); 578 579 plainText = converter.getObject(); 580 581 if (DEBUG) { 582 debug("Plain text: " + plainText); 583 584 for (int i = 0; i < nodes.size(); i++) { 585 int textPos = textPositions[i]; 586 String text = plainText.substring(textPos, textPositions[i + 1]); 587 debug("At " + i + ": pos=" + textPos + " " + text); 588 } 589 } 590 } 591 592 //------------------------------------------------------------------------ 593 // Spanned view of the html tree 594 //------------------------------------------------------------------------ 595 /** 596 * @return a Spanned representation of the html tree 597 */ getSpanned()598 public Spanned getSpanned() { 599 if (constructedSpan == null) { 600 convertToSpan(); 601 } 602 return constructedSpan; 603 } 604 605 /** 606 * Converts the html tree to plain text. 607 * We simply iterate through the nodes in the tree. 608 * As we output the plain-text, we keep track of the text position 609 * of each node. 610 * For String nodes, we replace '\n' with ' ' unless we're in a 611 * <pre> block. 612 */ convertToSpan()613 private void convertToSpan() { 614 X.assertTrue(constructedSpan == null); 615 616 int numNodes = nodes.size(); 617 618 Converter<Spanned> converter = (Converter<Spanned>) converterFactory.createInstance(); 619 620 for (int i = 0; i < numNodes; i++) { 621 converter.addNode(nodes.get(i), i, ends.get(i)); 622 } 623 624 constructedSpan = converter.getObject(); 625 } 626 627 /** 628 * Encapsulates the logic for outputting plain text with respect to text 629 * segments, white space separators, line breaks, and quote marks. 630 */ 631 @VisibleForTesting 632 static final class PlainTextPrinter { 633 /** 634 * Separators are whitespace inserted between segments of text. The 635 * semantics are such that between any two segments of text, there is 636 * at most one separator. As such, separators are ordered in increasing 637 * priority, and setting a separator multiple times between text will 638 * result in the single separator with the highest priority being used. 639 * For example, a LineBreak (one newline) will override a Space, but will 640 * be overriden by a BlankLine (two newlines). 641 */ 642 static enum Separator { 643 // The values here must be ordered by increasing priority, as the 644 // enum's ordinal() method is used when determining if a new separator 645 // should override an existing one. 646 None, 647 Space, // single space 648 LineBreak, // single new line 649 BlankLine // two new lines 650 } 651 652 // White space characters that are collapsed as a single space. 653 // Note that characters such as the non-breaking whitespace 654 // and full-width spaces are not equivalent to the normal spaces. 655 private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f"; 656 657 /** 658 * Determines if the given character is considered an HTML space character. 659 * Consecutive HTML space characters are collapsed into a single space when 660 * not within a PRE element. 661 */ isHtmlWhiteSpace(char ch)662 private static boolean isHtmlWhiteSpace(char ch) { 663 return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0; 664 } 665 666 // The buffer in which we accumulate the converted plain text 667 private final StringBuilder sb = new StringBuilder(); 668 669 // How many <blockquote> blocks we are in. 670 private int quoteDepth = 0; 671 672 // How many logical newlines are at the end of the buffer we've outputted. 673 // Note that we can't simply count the newlines at the end of the output 674 // buffer because a logical new line may be followed by quote marks. 675 // 676 // We initialize the value to 2 so that we consume any initial separators, 677 // since we don't need separators at the beginning of the output. This also 678 // results in correctly outputting any quote marks at the beginning of the 679 // output if the first piece of text is within a BLOCKQUOTE element. 680 private int endingNewLines = 2; 681 682 // The next separator to be inserted between two text nodes. 683 private Separator separator = Separator.None; 684 685 /** Returns the current length of the text. */ getTextLength()686 final int getTextLength() { 687 return sb.length(); 688 } 689 690 /** Returns the current text. */ getText()691 final String getText() { 692 return sb.toString(); 693 } 694 695 /** 696 * Sets the next separator between two text nodes. A Space separator is 697 * used if there is any whitespace between the two text nodes when there is 698 * no intervening element that breaks flow. This is automatically handled 699 * by the {@link #appendNormalText} function so the client never needs to 700 * specify this separator. 701 * <p> 702 * A LineBreak separator (single new line) is used if text segments are 703 * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR, 704 * etc.). The client should set this separator for opening and closing tags 705 * of any element that breaks flow. 706 * <p> 707 * A BlankLine separator (two new lines) should be set for opening and 708 * closing P tags. 709 * <p> 710 * If this method is called multiple times between text nodes, a 711 * separator with a higher priority will override that of a lower priority. 712 */ setSeparator(Separator newSeparator)713 final void setSeparator(Separator newSeparator) { 714 if (newSeparator.ordinal() > separator.ordinal()) { 715 separator = newSeparator; 716 } 717 } 718 719 /** Increments the current quote depth of the text. */ incQuoteDepth()720 final void incQuoteDepth() { 721 quoteDepth++; 722 } 723 724 /** Decrements the current quote depth of the text. */ decQuoteDepth()725 final void decQuoteDepth() { 726 quoteDepth = Math.max(0, quoteDepth - 1); 727 } 728 729 /** 730 * Normalizes the HTML whitespace in the given {@code text} and appends it 731 * as the next segment of text. This will flush any separator that should 732 * be appended before the text, as well as any quote marks that should 733 * follow the last newline if the quote depth is non-zero. 734 */ appendNormalText(String text)735 final void appendNormalText(String text) { 736 if (text.length() == 0) { 737 return; 738 } 739 boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0)); 740 boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1)); 741 742 // Strip beginning and ending whitespace. 743 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text); 744 745 // Collapse whitespace within the text. 746 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' '); 747 748 if (startsWithSpace) { 749 setSeparator(Separator.Space); 750 } 751 752 appendTextDirect(text); 753 754 if (endsWithSpace) { 755 setSeparator(Separator.Space); 756 } 757 } 758 759 /** 760 * Appends the given text, preserving all whitespace. This is used for 761 * appending text in a PRE element. 762 */ appendPreText(String text)763 final void appendPreText(String text) { 764 // We're in a <pre> block. Split the text into lines, and append 765 // each line with appendTextDirect() to preserve white space. 766 String[] lines = text.split("[\\r\\n]", -1); 767 768 // split() will always return an array with at least one element. 769 appendTextDirect(lines[0]); 770 771 // For all of the remaining lines, we append a newline first, which 772 // takes care of any quote marks that we need to output if the quote 773 // depth is non-zero. 774 for (int i = 1; i < lines.length; i++) { 775 appendNewLine(); 776 appendTextDirect(lines[i]); 777 } 778 } 779 780 /** 781 * Appends the {@code text} directly to the output, taking into account 782 * any separator that should be appended before it, and any quote marks 783 * that should follow the last newline if the quote depth is non-zero. 784 * <p> 785 * {@code text} must not contain any new lines--in order to handle 786 * quoting correctly, it is up to the caller to either normalize away the 787 * newlines, or split the text up into separate lines and handle new lines 788 * with the {@link #appendNewLine} method. 789 * <p> 790 * The original {@code text} is not modified in any way. Use this method 791 * when you need to preserve the original white space. 792 * <p> 793 * If the given {@code text} is non empty, this method will result in 794 * {@code endingNewLines} being reset to 0. 795 */ appendTextDirect(String text)796 private void appendTextDirect(String text) { 797 if (text.length() == 0) { 798 return; 799 } 800 Preconditions.checkArgument(text.indexOf('\n') < 0, 801 "text must not contain newlines."); 802 flushSeparator(); 803 maybeAddQuoteMarks(true); 804 sb.append(text); 805 endingNewLines = 0; 806 } 807 808 /** 809 * Appends a forced line break, which is the equivalent of a BR element. 810 */ 811 final void appendForcedLineBreak() { 812 flushSeparator(); 813 appendNewLine(); 814 } 815 816 /** 817 * Appends any pending separator to the output buffer. This should be 818 * called before appending text to the buffer. 819 */ 820 private void flushSeparator() { 821 switch (separator) { 822 case Space: 823 if (endingNewLines == 0) { 824 // Only append a space separator if we are not following a new 825 // line character. For example, we don't append a separator 826 // space after a <br> tag, since the <br>'s newline fulfills the 827 // space separation requirement. 828 sb.append(" "); 829 } 830 break; 831 case LineBreak: 832 while (endingNewLines < 1) { 833 appendNewLine(); 834 } 835 break; 836 case BlankLine: 837 while (endingNewLines < 2) { 838 appendNewLine(); 839 } 840 break; 841 } 842 separator = Separator.None; 843 } 844 845 /** 846 * Adds a newline to the output. This handles any quote marks that should 847 * follow any previous new lines, and increments {@code endingNewLines}. 848 */ 849 private void appendNewLine() { 850 maybeAddQuoteMarks(false); 851 sb.append('\n'); 852 endingNewLines++; 853 } 854 855 /** 856 * Adds quote marks to the output if we are at the beginning of a line. 857 * One '>' character is used for every level of quoting we are in. 858 * 859 * @param includeEndingSpace Includes a single space after the quote marks. 860 */ 861 private void maybeAddQuoteMarks(boolean includeEndingSpace) { 862 // We only need to add quote marks if we are at the beginning of line. 863 if (endingNewLines > 0 && quoteDepth > 0) { 864 for (int i = 0; i < quoteDepth; i++) { 865 sb.append('>'); 866 } 867 if (includeEndingSpace) { 868 sb.append(' '); 869 } 870 } 871 } 872 } 873 874 /** 875 * Contains the logic for converting the contents of one HtmlTree into 876 * plaintext. 877 */ 878 public static class DefaultPlainTextConverter implements Converter<String> { 879 880 private static final Set<HTML.Element> BLANK_LINE_ELEMENTS = 881 ImmutableSet.of( 882 HTML4.P_ELEMENT, 883 HTML4.BLOCKQUOTE_ELEMENT, 884 HTML4.PRE_ELEMENT); 885 886 private final PlainTextPrinter printer = new PlainTextPrinter(); 887 888 private int preDepth = 0; 889 private int styleDepth = 0; 890 891 @Override 892 public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) { 893 if (n instanceof HtmlDocument.Text) { // A string node 894 895 HtmlDocument.Text textNode = (HtmlDocument.Text) n; 896 String str = textNode.getText(); 897 898 if (preDepth > 0) { 899 printer.appendPreText(str); 900 901 } else if (styleDepth > 0) { 902 // Append nothing 903 } else { 904 printer.appendNormalText(str); 905 } 906 907 } else if (n instanceof HtmlDocument.Tag) { 908 909 // Check for linebreaking tags. 910 HtmlDocument.Tag tag = (HtmlDocument.Tag) n; 911 HTML.Element element = tag.getElement(); 912 913 if (BLANK_LINE_ELEMENTS.contains(element)) { 914 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 915 916 } else if (HTML4.BR_ELEMENT.equals(element)) { 917 // The <BR> element is special in that it always adds a newline. printer.appendForcedLineBreak()918 printer.appendForcedLineBreak(); 919 920 } else if (element.breaksFlow()) { 921 // All other elements that break the flow add a LineBreak separator. 922 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 923 924 if (HTML4.HR_ELEMENT.equals(element)) { 925 printer.appendNormalText("________________________________"); 926 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 927 } 928 } 929 930 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { printer.incQuoteDepth()931 printer.incQuoteDepth(); 932 933 } else if (HTML4.PRE_ELEMENT.equals(element)) { 934 preDepth++; 935 } else if (HTML4.STYLE_ELEMENT.equals(element)) { 936 styleDepth++; 937 } 938 939 } else if (n instanceof HtmlDocument.EndTag) { 940 941 // Check for linebreaking tags. 942 HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n; 943 HTML.Element element = endTag.getElement(); 944 945 if (BLANK_LINE_ELEMENTS.contains(element)) { 946 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 947 948 } else if (element.breaksFlow()) { 949 // All other elements that break the flow add a LineBreak separator. 950 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 951 } 952 953 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { printer.decQuoteDepth()954 printer.decQuoteDepth(); 955 956 } else if (HTML4.PRE_ELEMENT.equals(element)) { 957 preDepth--; 958 } else if (HTML4.STYLE_ELEMENT.equals(element)) { 959 styleDepth--; 960 } 961 } 962 } 963 964 @Override getPlainTextLength()965 public final int getPlainTextLength() { 966 return printer.getTextLength(); 967 } 968 969 @Override getObject()970 public final String getObject() { 971 return printer.getText(); 972 } 973 } 974 975 //------------------------------------------------------------------------ 976 // The following methods are used to build the html tree. 977 //------------------------------------------------------------------------ 978 /** For building the html tree */ 979 private Stack<Integer> stack; 980 private int parent; 981 982 /** Starts the build process */ start()983 void start() { 984 stack = new Stack<Integer>(); 985 parent = -1; 986 } 987 988 /** Finishes the build process */ finish()989 void finish() { 990 X.assertTrue(stack.size() == 0); 991 X.assertTrue(parent == -1); 992 } 993 994 /** 995 * Adds a html start tag, there must followed later by a call to addEndTag() 996 * to add the matching end tag 997 */ addStartTag(HtmlDocument.Tag t)998 void addStartTag(HtmlDocument.Tag t) { 999 int nodenum = nodes.size(); 1000 addNode(t, nodenum, -1); 1001 1002 stack.add(parent); 1003 parent = nodenum; 1004 } 1005 1006 /** 1007 * Adds a html end tag, this must be preceded by a previous matching open tag 1008 */ addEndTag(HtmlDocument.EndTag t)1009 void addEndTag(HtmlDocument.EndTag t) { 1010 int nodenum = nodes.size(); 1011 addNode(t, parent, nodenum); 1012 1013 if (parent != -1) { 1014 ends.set(parent, nodenum); 1015 } 1016 1017 parent = stack.pop(); 1018 } 1019 1020 /** Adds a singular tag that does not have a corresponding end tag */ addSingularTag(HtmlDocument.Tag t)1021 void addSingularTag(HtmlDocument.Tag t) { 1022 int nodenum = nodes.size(); 1023 addNode(t, nodenum, nodenum); 1024 } 1025 1026 /** 1027 * Adds a text 1028 * @param t a plain-text string 1029 */ addText(HtmlDocument.Text t)1030 void addText(HtmlDocument.Text t) { 1031 int nodenum = nodes.size(); 1032 addNode(t, nodenum, nodenum); 1033 } 1034 1035 /** Adds a node */ addNode(HtmlDocument.Node n, int begin, int end)1036 private void addNode(HtmlDocument.Node n, int begin, int end) { 1037 nodes.add(n); 1038 begins.add(begin); 1039 ends.add(end); 1040 } 1041 1042 /** For debugging */ debug(String str)1043 private static final void debug(String str) { 1044 logger.finest(str); 1045 } 1046 1047 }