• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2004, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.google.android.mail.common.html.parser;
17 
18 import android.text.Spanned;
19 
20 import com.google.android.mail.common.base.CharMatcher;
21 import com.google.android.mail.common.base.Preconditions;
22 import com.google.android.mail.common.base.X;
23 import com.google.common.annotations.VisibleForTesting;
24 import com.google.common.collect.ImmutableSet;
25 
26 import java.util.ArrayList;
27 import java.util.Arrays;
28 import java.util.Collections;
29 import java.util.List;
30 import java.util.Set;
31 import java.util.Stack;
32 import java.util.logging.Logger;
33 
34 /**
35  * HtmlTree represents a parsed and well-formed html text, it provides
36  * methods to convert to plain text. It also provides methods to find
37  * well-formed blocks of text, for quote detection.
38  *
39  * We don't really build a html tree data structure. Instead, for
40  * efficiency, and for the ability to do a simple in-order-traversal
41  * of the tree, we simply keeps a linear list of nodes (in-order).
42  * The begin_ and end_ arrays keeps track of the starting end ending
43  * nodes:
44  *
45  * For a string node, begin_[node] = end_[node] = node
46  * For an open tag, begin_[node] = node, end_[node] = the matching end tag
47  * For a close tag, end_[node] = the matching open tag, end_[node] = node
48  *
49  * @author jlim@google.com (Jing Yee Lim)
50  */
51 public class HtmlTree {
52   // http://www.w3.org/TR/html4/struct/text.html#h-9.1
53   private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n");
54 
55   /**
56    * An interface that allows clients to provide their own implementation
57    * of a {@link Converter}.
58    */
59   public static interface ConverterFactory {
60     /**
61      * Creates a new instance of a {@link Converter} to convert
62      * the contents of an {@link HtmlTree} to some resulting object.
63      */
createInstance()64     Converter createInstance();
65   }
66 
67   /**
68    * An interface for an object which converts a single HtmlTree into some object
69    */
70   public static interface Converter<T> {
71     /**
72      * Adds the given node {@code n} to plain text.
73      *
74      * @param n The node to convert to text.
75      * @param nodeNum The number of the node among the list of all notes.
76      * @param endNum The number of the ending node if this is a start node,
77      *    otherwise the same as {@code nodeNum}.
78      */
addNode(HtmlDocument.Node n, int nodeNum, int endNum)79     void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
80 
81     /**
82      * Returns the current length of the plain text.
83      */
getPlainTextLength()84     int getPlainTextLength();
85 
86     /**
87      * Returns the current built object.
88      */
getObject()89     T getObject();
90   }
91 
92   /** A factory that produces converters of the default implementation. */
93   private static final ConverterFactory DEFAULT_CONVERTER_FACTORY =
94       new ConverterFactory() {
95         @Override
96         public Converter<String> createInstance() {
97           return new DefaultPlainTextConverter();
98         }
99       };
100 
101   /** Contains html nodes */
102   private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
103 
104   /** Keeps track of beginning and end of each node */
105   private final Stack<Integer> begins = new Stack<Integer>();
106   private final Stack<Integer> ends = new Stack<Integer>();
107 
108   /** Plain text (lazy creation) */
109   private String plainText;
110 
111   /** Constructed span (lazy creation) */
112   private Spanned constructedSpan;
113 
114   /** The html string (lazy creation) */
115   private String html;
116 
117   /** textPositions[node pos] = the text position */
118   private int[] textPositions;
119 
120   private ConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
121 
122   // For debugging only
123   private static final boolean DEBUG = false;
124 
125   private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
126 
127   //------------------------------------------------------------------------
128 
129   /** HtmlTree can only be constructed from this package */
HtmlTree()130   HtmlTree() {
131   }
132 
133   /**
134    * Sets a new {@link ConverterFactory} to be used to convert
135    * the contents of this tree to plaintext.
136    */
setConverterFactory(ConverterFactory factory)137   public void setConverterFactory(ConverterFactory factory) {
138     if (factory == null) {
139       throw new NullPointerException("factory must not be null");
140     }
141     converterFactory = factory;
142   }
143 
144   /**
145    * Gets the list of node objects. A node can be either a
146    * Tag, EngTag or a String object.
147    * @return the nodes of the tree
148    */
getNodesList()149   public List<HtmlDocument.Node> getNodesList() {
150     return Collections.unmodifiableList(nodes);
151   }
152 
153   /**
154    * @return number of nodes
155    */
getNumNodes()156   public int getNumNodes() {
157     return nodes.size();
158   }
159 
160   /**
161    * Returns number of matching open tag node, or {@code endTagNodeNum} itself
162    * if it does not point to a closing tag.
163    */
findOpenTag(int endTagNodeNum)164   public int findOpenTag(int endTagNodeNum) {
165     X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size());
166     return begins.get(endTagNodeNum);
167   }
168 
169   /**
170    * Returns number of matching closing tag node, or {@code openTagNodeNum} itself
171    * if it does not point to an open tag or points to an open tag with no closing one.
172    */
findEndTag(int openTagNodeNum)173   public int findEndTag(int openTagNodeNum) {
174     X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size());
175     return ends.get(openTagNodeNum);
176   }
177 
178   /**
179    * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself
180    * if it does not point to an open/closing tag (e.g text node or comment).
181    */
findPairedTag(int tagNodeNum)182   public int findPairedTag(int tagNodeNum) {
183     X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size());
184     int openNodeNum = begins.get(tagNodeNum);
185     int endNodeNum = ends.get(tagNodeNum);
186     return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum;
187   }
188 
189   /**
190    * Gets the entire html.
191    */
getHtml()192   public String getHtml() {
193     return getHtml(-1);
194   }
195 
196   /**
197    * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
198    * specified size.
199    */
getHtml(int wrapSize)200   public String getHtml(int wrapSize) {
201     if (html == null) {
202       html = getHtml(0, nodes.size(), wrapSize);
203     }
204     return html;
205   }
206 
207   /** Gets parts of the html */
getHtml(int fromNode, int toNode)208   public String getHtml(int fromNode, int toNode) {
209     return getHtml(fromNode, toNode, -1);
210   }
211 
212   /**
213    * Gets parts of the html, if wrapSize is > 0, it tries
214    * to do wrapping at the specified size.
215    */
getHtml(int fromNode, int toNode, int wrapSize)216   public String getHtml(int fromNode, int toNode, int wrapSize) {
217     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
218 
219     int estSize = (toNode - fromNode) * 10;
220     StringBuilder sb = new StringBuilder(estSize);
221     int lastWrapIndex = 0;      // used for wrapping
222     for (int n = fromNode; n < toNode; n++) {
223       HtmlDocument.Node node = nodes.get(n);
224       node.toHTML(sb);
225       // TODO: maybe we can be smarter about this and not add newlines
226       // within <pre> tags, unless the whole long line is encompassed
227       // by the <pre> tag.
228       if (wrapSize > 0) {
229         // We can only wrap if the last outputted node is an element that
230         // breaks the flow. Otherwise, we risk the possibility of inserting
231         // spaces where they shouldn't be.
232         if ((node instanceof HtmlDocument.Tag &&
233               ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
234             (node instanceof HtmlDocument.EndTag &&
235               ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
236           // Check to see if there is a newline in the most recent node's html.
237           int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
238           if (recentNewLine != -1) {
239             lastWrapIndex += recentNewLine;
240           }
241           // If the last index - last index of a newline is greater than
242           // wrapSize, add a newline.
243           if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
244             sb.append('\n');
245             lastWrapIndex = sb.length() - 1;
246           }
247         }
248       }
249     }
250 
251     return sb.toString();
252   }
253 
254   /**
255    * Convert a html region into chunks of html code, each containing
256    * roughly chunkSize characters.
257    */
getHtmlChunks(int fromNode, int toNode, int chunkSize)258   public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
259     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
260 
261     ArrayList<String> chunks = new ArrayList<String>();
262 
263     // Do a best effort attempt to not split apart certain elements (as of now,
264     // just the <textarea>). We cannot guarantee that they will not be split
265     // because the client may specify endpoint nodes that land in the middle
266     // of an element (although this shouldn't happen if the endpoints returned
267     // by createBlocks() are properly used).
268     int stack = 0;
269     boolean balanced = true;
270 
271     StringBuilder sb = new StringBuilder(chunkSize + 256);
272     for (int n = fromNode; n < toNode; n++) {
273       HtmlDocument.Node node = nodes.get(n);
274       node.toHTML(sb);
275 
276       if (node instanceof HtmlDocument.Tag) {
277         if (HTML4.TEXTAREA_ELEMENT.equals(
278             ((HtmlDocument.Tag) node).getElement())) {
279           stack++;
280         }
281       }
282       if (node instanceof HtmlDocument.EndTag) {
283         if (HTML4.TEXTAREA_ELEMENT.equals(
284             ((HtmlDocument.EndTag) node).getElement())) {
285           if (stack == 0) {
286             balanced = false;
287           } else {
288             stack--;
289           }
290         }
291       }
292 
293       if (stack == 0 && sb.length() >= chunkSize) {
294         chunks.add(sb.toString());
295         sb.setLength(0);
296       }
297     }
298 
299     // Don't forget the last chunk!
300     if (sb.length() > 0) {
301       chunks.add(sb.toString());
302     }
303 
304     // If the tree is not balanced (cut off in the middle of a node), log
305     // debug data. Clients should fix their code so that the endpoints from
306     // createBlocks() are properly used.
307     if (!balanced || stack != 0) {
308       StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
309       debug.append(getHtml());
310       debug.append("\nfromNode: ").append(fromNode);
311       debug.append("\ntoNode: ").append(toNode);
312       debug.append("\nNum nodes_: ").append(getNumNodes());
313       for (String chunk : chunks) {
314         debug.append("\nChunk:\n").append(chunk);
315       }
316       logger.severe(debug.toString());
317     }
318 
319     return chunks;
320   }
321 
322   /**
323    * Returns height (maximum length from root to a leaf) of the HTML tree.
324    * @return height of the HTML tree.
325    */
getTreeHeight()326   public int getTreeHeight() {
327     int currentHeight = 0;
328     int maxHeight = 0;
329 
330     for (int i = 0; i < nodes.size(); i++) {
331       HtmlDocument.Node node = nodes.get(i);
332       if (node instanceof HtmlDocument.Tag) {
333         currentHeight++;
334         if (currentHeight > maxHeight) {
335           maxHeight = currentHeight;
336         }
337         if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
338           // Empty tags have no closing pair, so decrease counter here.
339           currentHeight--;
340         }
341       } else if (node instanceof HtmlDocument.EndTag) {
342         currentHeight--;
343       }
344     }
345 
346     // TODO(anatol): make this value cachable?
347     return maxHeight;
348   }
349 
350   //------------------------------------------------------------------------
351   // Creating well-formed blocks within the html tree.
352   //------------------------------------------------------------------------
353   /**
354    * A Block represents a region of a html tree that
355    * 1) is well-formed, i.e. for each node in the block, all its descendants
356    * are also contained in the block. So it's safe to wrap the region
357    * within a <table> or <div>, etc.
358    * 2) starts at the beginning of a "line", e.g. a <div>, a <br>.
359    */
360   public static class Block {
361     /* The starting node */
362     public int start_node;
363 
364     /* The ending node (non-inclusive to the block) */
365     public int end_node;
366   }
367 
368   /**
369    * Creates a list of Blocks, given a text-range.
370    * We may create multiple blocks if one single well-formed Block cannot be
371    * created.
372    *
373    * @param textStart beginning plain-text offset
374    * @param textEnd beginning plain-text offset
375    * @param minNode the smallest node number
376    * @param maxNode the largest node number
377    * @return a list of 0 or more Block objects, never null
378    */
createBlocks(int textStart, int textEnd, int minNode, int maxNode)379   public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
380 
381     ArrayList<Block> blocks = new ArrayList<Block>();
382     int startNode = Math.max(getBlockStart(textStart), minNode);
383     int endNode = Math.min(getBlockEnd(textEnd), maxNode);
384 
385     if (DEBUG) {
386       debug("Creating block: " +
387             "text pos: " + textStart + "-" + textEnd + "\n" +
388             "node pos: " + startNode + "-" + endNode + "\n" +
389             plainText.substring(textStart, textEnd));
390     }
391 
392     // Split up the block [start, end) into one or more blocks that
393     // are well-formed, and begins at a "line" boundary.
394     int blockStart = -1;
395     for (int n = startNode; n < endNode;) {
396 
397       // The node n spans [nBegin, nEnd]
398       int nBegin = begins.get(n);
399       int nEnd = ends.get(n);
400 
401       if (blockStart == -1) {
402         // Check if this is a valid start node
403         if (nBegin >= n && nEnd <= endNode &&
404             canBeginBlockAt(n)) {
405           blockStart = n;
406           n = nEnd + 1;
407         } else {
408           n++;
409         }
410         continue;
411       }
412 
413       // If the node [nBegin, nEnd) lies completely within
414       // the region then proceed to the (nEnd + 1).
415       if (nBegin >= blockStart && nEnd < endNode) {
416         n = nEnd + 1;
417         continue;
418       }
419 
420       // If we got here, we have to break up the region into one
421       // or more blocks because the current node cannot be included
422       // in the region.
423       if (DEBUG) {
424         debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
425               ") exceeds (" + blockStart + " " + endNode + ")");
426       }
427       Block b = new Block();
428       b.start_node = blockStart;
429       b.end_node = n;
430       blocks.add(b);
431 
432       blockStart = -1;
433       n++;
434     }
435 
436     // Last block
437     if (blockStart != -1) {
438       Block b = new Block();
439       b.start_node = blockStart;
440       b.end_node = endNode;
441       blocks.add(b);
442     }
443 
444     if (DEBUG) {
445       for (int i = 0; i < blocks.size(); i++) {
446         Block b = blocks.get(i);
447         debug("Block " + i + "/" + blocks.size() + ": " +
448               b.start_node + "-" + b.end_node + " " +
449               getPlainText(b.start_node, b.end_node));
450       }
451     }
452 
453     return blocks;
454   }
455 
456   /**
457    * Checks if a block can begin starting from a node position
458    */
canBeginBlockAt(int nodePos)459   private boolean canBeginBlockAt(int nodePos) {
460     int textPos = textPositions[nodePos];
461 
462     // Make sure that we don't exceed the text position, this happens
463     // for the last tag nodes.
464     if (textPos == plainText.length()) {
465       textPos--;
466     }
467 
468     // Scan backwards to check if a nodePos is at the beginning
469     // of a line.
470     for (int i = textPos; i > 0; i--) {
471       char ch = plainText.charAt(i);
472       if (ch == '\n') {
473         return true;
474       }
475       if (i < textPos && !HTML_WHITESPACE.matches(ch)) {
476         return false;
477       }
478     }
479     return true;
480   }
481 
482   /**
483    * Returns the start of a block given a text-pos
484    */
getBlockStart(int textPos)485   private int getBlockStart(int textPos) {
486     int nodenum = Arrays.binarySearch(textPositions, textPos);
487     if (nodenum >= 0) {
488       // Got an exact node alignment. Get the outer most pos that
489       // matches the text position
490       while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
491         nodenum--;
492       }
493     } else {
494       // textPos matches the middle of a node.
495       nodenum = -nodenum - 1;
496     }
497 
498     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
499     return nodenum;
500   }
501 
502   /**
503    * Returns the end of a block given a text-pos
504    */
getBlockEnd(int textPos)505   private int getBlockEnd(int textPos) {
506     int nodenum = Arrays.binarySearch(textPositions, textPos);
507     if (nodenum >= 0) {
508       // Got an exact node alignment.
509       while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
510         nodenum++;
511       }
512     } else {
513       // textPos matches the middle of a node.
514       nodenum = -nodenum - 2;
515     }
516     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
517     return nodenum;
518   }
519 
520   //------------------------------------------------------------------------
521   // Plain text view of the html tree
522   //------------------------------------------------------------------------
523   /**
524    * @return the plain-text position corresponding to the node
525    */
getTextPosition(int node)526   public int getTextPosition(int node) {
527     return textPositions[node];
528   }
529 
530   /**
531    * @return a plain-text String of the html tree
532    */
getPlainText()533   public String getPlainText() {
534     if (plainText == null) {
535       convertToPlainText();
536     }
537     return plainText;
538   }
539 
540   /**
541    * @return a plain-text String of a part of the html tree
542    */
getPlainText(int fromNode, int toNode)543   public String getPlainText(int fromNode, int toNode) {
544     if (plainText == null) {
545       convertToPlainText();
546     }
547     int textstart = textPositions[fromNode];
548     int textend = textPositions[toNode];
549     return plainText.substring(textstart, textend);
550   }
551 
552   /**
553    * Converts the html tree to plain text.
554    * We simply iterate through the nodes in the tree.
555    * As we output the plain-text, we keep track of the text position
556    * of each node.
557    * For String nodes, we replace '\n' with ' ' unless we're in a
558    * <pre> block.
559    */
convertToPlainText()560   private void convertToPlainText() {
561     X.assertTrue(plainText == null && textPositions == null);
562 
563     int numNodes = nodes.size();
564 
565     // Keeps track of start text position of each node, including a last
566     // entry for the size of the text.
567     textPositions = new int[numNodes + 1];
568 
569     Converter<String> converter = (Converter<String>) converterFactory.createInstance();
570 
571     for (int i = 0; i < numNodes; i++) {
572       textPositions[i] = converter.getPlainTextLength();
573       converter.addNode(nodes.get(i), i, ends.get(i));
574     }
575 
576     // Add a last entry, so that textPositions_[nodes_.size()] is valid.
577     textPositions[numNodes] = converter.getPlainTextLength();
578 
579     plainText = converter.getObject();
580 
581     if (DEBUG) {
582       debug("Plain text: " + plainText);
583 
584       for (int i = 0; i < nodes.size(); i++) {
585         int textPos = textPositions[i];
586         String text = plainText.substring(textPos, textPositions[i + 1]);
587         debug("At " + i + ": pos=" + textPos + " " +  text);
588       }
589     }
590   }
591 
592     //------------------------------------------------------------------------
593     // Spanned view of the html tree
594     //------------------------------------------------------------------------
595     /**
596      * @return a Spanned representation of the html tree
597      */
getSpanned()598     public Spanned getSpanned() {
599         if (constructedSpan == null) {
600             convertToSpan();
601         }
602         return constructedSpan;
603     }
604 
605     /**
606      * Converts the html tree to plain text.
607      * We simply iterate through the nodes in the tree.
608      * As we output the plain-text, we keep track of the text position
609      * of each node.
610      * For String nodes, we replace '\n' with ' ' unless we're in a
611      * <pre> block.
612      */
convertToSpan()613     private void convertToSpan() {
614         X.assertTrue(constructedSpan == null);
615 
616         int numNodes = nodes.size();
617 
618         Converter<Spanned> converter = (Converter<Spanned>) converterFactory.createInstance();
619 
620         for (int i = 0; i < numNodes; i++) {
621             converter.addNode(nodes.get(i), i, ends.get(i));
622         }
623 
624         constructedSpan = converter.getObject();
625     }
626 
627   /**
628    * Encapsulates the logic for outputting plain text with respect to text
629    * segments, white space separators, line breaks, and quote marks.
630    */
631   @VisibleForTesting
632   static final class PlainTextPrinter {
633     /**
634      * Separators are whitespace inserted between segments of text. The
635      * semantics are such that between any two segments of text, there is
636      * at most one separator. As such, separators are ordered in increasing
637      * priority, and setting a separator multiple times between text will
638      * result in the single separator with the highest priority being used.
639      * For example, a LineBreak (one newline) will override a Space, but will
640      * be overriden by a BlankLine (two newlines).
641      */
642     static enum Separator {
643       // The values here must be ordered by increasing priority, as the
644       // enum's ordinal() method is used when determining if a new separator
645       // should override an existing one.
646       None,
647       Space,      // single space
648       LineBreak,  // single new line
649       BlankLine   // two new lines
650     }
651 
652     // White space characters that are collapsed as a single space.
653     // Note that characters such as the non-breaking whitespace
654     // and full-width spaces are not equivalent to the normal spaces.
655     private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
656 
657     /**
658      * Determines if the given character is considered an HTML space character.
659      * Consecutive HTML space characters are collapsed into a single space when
660      * not within a PRE element.
661      */
isHtmlWhiteSpace(char ch)662     private static boolean isHtmlWhiteSpace(char ch) {
663       return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
664     }
665 
666     // The buffer in which we accumulate the converted plain text
667     private final StringBuilder sb = new StringBuilder();
668 
669     // How many <blockquote> blocks we are in.
670     private int quoteDepth = 0;
671 
672     // How many logical newlines are at the end of the buffer we've outputted.
673     // Note that we can't simply count the newlines at the end of the output
674     // buffer because a logical new line may be followed by quote marks.
675     //
676     // We initialize the value to 2 so that we consume any initial separators,
677     // since we don't need separators at the beginning of the output. This also
678     // results in correctly outputting any quote marks at the beginning of the
679     // output if the first piece of text is within a BLOCKQUOTE element.
680     private int endingNewLines = 2;
681 
682     // The next separator to be inserted between two text nodes.
683     private Separator separator = Separator.None;
684 
685     /** Returns the current length of the text. */
getTextLength()686     final int getTextLength() {
687       return sb.length();
688     }
689 
690     /** Returns the current text. */
getText()691     final String getText() {
692       return sb.toString();
693     }
694 
695     /**
696      * Sets the next separator between two text nodes. A Space separator is
697      * used if there is any whitespace between the two text nodes when there is
698      * no intervening element that breaks flow. This is automatically handled
699      * by the {@link #appendNormalText} function so the client never needs to
700      * specify this separator.
701      * <p>
702      * A LineBreak separator (single new line) is used if text segments are
703      * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
704      * etc.). The client should set this separator for opening and closing tags
705      * of any element that breaks flow.
706      * <p>
707      * A BlankLine separator (two new lines) should be set for opening and
708      * closing P tags.
709      * <p>
710      * If this method is called multiple times between text nodes, a
711      * separator with a higher priority will override that of a lower priority.
712      */
setSeparator(Separator newSeparator)713     final void setSeparator(Separator newSeparator) {
714       if (newSeparator.ordinal() > separator.ordinal()) {
715         separator = newSeparator;
716       }
717     }
718 
719     /** Increments the current quote depth of the text. */
incQuoteDepth()720     final void incQuoteDepth() {
721       quoteDepth++;
722     }
723 
724     /** Decrements the current quote depth of the text. */
decQuoteDepth()725     final void decQuoteDepth() {
726       quoteDepth = Math.max(0, quoteDepth - 1);
727     }
728 
729     /**
730      * Normalizes the HTML whitespace in the given {@code text} and appends it
731      * as the next segment of text. This will flush any separator that should
732      * be appended before the text, as well as any quote marks that should
733      * follow the last newline if the quote depth is non-zero.
734      */
appendNormalText(String text)735     final void appendNormalText(String text) {
736       if (text.length() == 0) {
737         return;
738       }
739       boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
740       boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
741 
742       // Strip beginning and ending whitespace.
743       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
744 
745       // Collapse whitespace within the text.
746       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
747 
748       if (startsWithSpace) {
749         setSeparator(Separator.Space);
750       }
751 
752       appendTextDirect(text);
753 
754       if (endsWithSpace) {
755         setSeparator(Separator.Space);
756       }
757     }
758 
759     /**
760      * Appends the given text, preserving all whitespace. This is used for
761      * appending text in a PRE element.
762      */
appendPreText(String text)763     final void appendPreText(String text) {
764       // We're in a <pre> block. Split the text into lines, and append
765       // each line with appendTextDirect() to preserve white space.
766       String[] lines = text.split("[\\r\\n]", -1);
767 
768       // split() will always return an array with at least one element.
769       appendTextDirect(lines[0]);
770 
771       // For all of the remaining lines, we append a newline first, which
772       // takes care of any quote marks that we need to output if the quote
773       // depth is non-zero.
774       for (int i = 1; i < lines.length; i++) {
775         appendNewLine();
776         appendTextDirect(lines[i]);
777       }
778     }
779 
780     /**
781      * Appends the {@code text} directly to the output, taking into account
782      * any separator that should be appended before it, and any quote marks
783      * that should follow the last newline if the quote depth is non-zero.
784      * <p>
785      * {@code text} must not contain any new lines--in order to handle
786      * quoting correctly, it is up to the caller to either normalize away the
787      * newlines, or split the text up into separate lines and handle new lines
788      * with the {@link #appendNewLine} method.
789      * <p>
790      * The original {@code text} is not modified in any way. Use this method
791      * when you need to preserve the original white space.
792      * <p>
793      * If the given {@code text} is non empty, this method will result in
794      * {@code endingNewLines} being reset to 0.
795      */
appendTextDirect(String text)796     private void appendTextDirect(String text) {
797       if (text.length() == 0) {
798         return;
799       }
800       Preconditions.checkArgument(text.indexOf('\n') < 0,
801                                   "text must not contain newlines.");
802       flushSeparator();
803       maybeAddQuoteMarks(true);
804       sb.append(text);
805       endingNewLines = 0;
806     }
807 
808     /**
809      * Appends a forced line break, which is the equivalent of a BR element.
810      */
811     final void appendForcedLineBreak() {
812       flushSeparator();
813       appendNewLine();
814     }
815 
816     /**
817      * Appends any pending separator to the output buffer. This should be
818      * called before appending text to the buffer.
819      */
820     private void flushSeparator() {
821       switch (separator) {
822         case Space:
823           if (endingNewLines == 0) {
824             // Only append a space separator if we are not following a new
825             // line character. For example, we don't append a separator
826             // space after a <br> tag, since the <br>'s newline fulfills the
827             // space separation requirement.
828             sb.append(" ");
829           }
830           break;
831         case LineBreak:
832           while (endingNewLines < 1) {
833             appendNewLine();
834           }
835           break;
836         case BlankLine:
837           while (endingNewLines < 2) {
838             appendNewLine();
839           }
840           break;
841       }
842       separator = Separator.None;
843     }
844 
845     /**
846      * Adds a newline to the output. This handles any quote marks that should
847      * follow any previous new lines, and increments {@code endingNewLines}.
848      */
849     private void appendNewLine() {
850       maybeAddQuoteMarks(false);
851       sb.append('\n');
852       endingNewLines++;
853     }
854 
855     /**
856      * Adds quote marks to the output if we are at the beginning of a line.
857      * One '>' character is used for every level of quoting we are in.
858      *
859      * @param includeEndingSpace Includes a single space after the quote marks.
860      */
861     private void maybeAddQuoteMarks(boolean includeEndingSpace) {
862       // We only need to add quote marks if we are at the beginning of line.
863       if (endingNewLines > 0 && quoteDepth > 0) {
864         for (int i = 0; i < quoteDepth; i++) {
865           sb.append('>');
866         }
867         if (includeEndingSpace) {
868           sb.append(' ');
869         }
870       }
871     }
872   }
873 
874   /**
875    * Contains the logic for converting the contents of one HtmlTree into
876    * plaintext.
877    */
878   public static class DefaultPlainTextConverter implements Converter<String> {
879 
880     private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
881         ImmutableSet.of(
882             HTML4.P_ELEMENT,
883             HTML4.BLOCKQUOTE_ELEMENT,
884             HTML4.PRE_ELEMENT);
885 
886     private final PlainTextPrinter printer = new PlainTextPrinter();
887 
888     private int preDepth = 0;
889     private int styleDepth = 0;
890 
891     @Override
892     public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
893       if (n instanceof HtmlDocument.Text) {        // A string node
894 
895         HtmlDocument.Text textNode = (HtmlDocument.Text) n;
896         String str = textNode.getText();
897 
898         if (preDepth > 0) {
899           printer.appendPreText(str);
900 
901         } else if (styleDepth > 0) {
902           // Append nothing
903         } else {
904           printer.appendNormalText(str);
905         }
906 
907       } else if (n instanceof HtmlDocument.Tag) {
908 
909         // Check for linebreaking tags.
910         HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
911         HTML.Element element = tag.getElement();
912 
913         if (BLANK_LINE_ELEMENTS.contains(element)) {
914           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
915 
916         } else if (HTML4.BR_ELEMENT.equals(element)) {
917           // The <BR> element is special in that it always adds a newline.
printer.appendForcedLineBreak()918           printer.appendForcedLineBreak();
919 
920         } else if (element.breaksFlow()) {
921           // All other elements that break the flow add a LineBreak separator.
922           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
923 
924           if (HTML4.HR_ELEMENT.equals(element)) {
925             printer.appendNormalText("________________________________");
926             printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
927           }
928         }
929 
930         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
printer.incQuoteDepth()931           printer.incQuoteDepth();
932 
933         } else if (HTML4.PRE_ELEMENT.equals(element)) {
934           preDepth++;
935         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
936           styleDepth++;
937         }
938 
939       } else if (n instanceof HtmlDocument.EndTag) {
940 
941         // Check for linebreaking tags.
942         HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
943         HTML.Element element = endTag.getElement();
944 
945         if (BLANK_LINE_ELEMENTS.contains(element)) {
946           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
947 
948         } else if (element.breaksFlow()) {
949           // All other elements that break the flow add a LineBreak separator.
950           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
951         }
952 
953         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
printer.decQuoteDepth()954           printer.decQuoteDepth();
955 
956         } else if (HTML4.PRE_ELEMENT.equals(element)) {
957           preDepth--;
958         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
959           styleDepth--;
960         }
961       }
962     }
963 
964     @Override
getPlainTextLength()965     public final int getPlainTextLength() {
966       return printer.getTextLength();
967     }
968 
969     @Override
getObject()970     public final String getObject() {
971       return printer.getText();
972     }
973   }
974 
975   //------------------------------------------------------------------------
976   // The following methods are used to build the html tree.
977   //------------------------------------------------------------------------
978   /** For building the html tree */
979   private Stack<Integer> stack;
980   private int parent;
981 
982   /** Starts the build process */
start()983   void start() {
984     stack = new Stack<Integer>();
985     parent = -1;
986   }
987 
988   /** Finishes the build process */
finish()989   void finish() {
990     X.assertTrue(stack.size() == 0);
991     X.assertTrue(parent == -1);
992   }
993 
994   /**
995    * Adds a html start tag, there must followed later by a call to addEndTag()
996    * to add the matching end tag
997    */
addStartTag(HtmlDocument.Tag t)998   void addStartTag(HtmlDocument.Tag t) {
999     int nodenum = nodes.size();
1000     addNode(t, nodenum, -1);
1001 
1002     stack.add(parent);
1003     parent = nodenum;
1004   }
1005 
1006   /**
1007    * Adds a html end tag, this must be preceded by a previous matching open tag
1008    */
addEndTag(HtmlDocument.EndTag t)1009   void addEndTag(HtmlDocument.EndTag t) {
1010     int nodenum = nodes.size();
1011     addNode(t, parent, nodenum);
1012 
1013     if (parent != -1) {
1014       ends.set(parent, nodenum);
1015     }
1016 
1017     parent = stack.pop();
1018   }
1019 
1020   /** Adds a singular tag that does not have a corresponding end tag */
addSingularTag(HtmlDocument.Tag t)1021   void addSingularTag(HtmlDocument.Tag t) {
1022     int nodenum = nodes.size();
1023     addNode(t, nodenum, nodenum);
1024   }
1025 
1026   /**
1027    * Adds a text
1028    * @param t a plain-text string
1029    */
addText(HtmlDocument.Text t)1030   void addText(HtmlDocument.Text t) {
1031     int nodenum = nodes.size();
1032     addNode(t, nodenum, nodenum);
1033   }
1034 
1035   /** Adds a node */
addNode(HtmlDocument.Node n, int begin, int end)1036   private void addNode(HtmlDocument.Node n, int begin, int end) {
1037     nodes.add(n);
1038     begins.add(begin);
1039     ends.add(end);
1040   }
1041 
1042   /** For debugging */
debug(String str)1043   private static final void debug(String str) {
1044     logger.finest(str);
1045   }
1046 
1047 }