• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2004, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.android.mail.lib.html.parser;
17 
18 import com.android.mail.lib.base.CharMatcher;
19 import com.android.mail.lib.base.Preconditions;
20 import com.android.mail.lib.base.X;
21 import com.google.common.collect.ImmutableSet;
22 
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collections;
26 import java.util.List;
27 import java.util.Set;
28 import java.util.Stack;
29 import java.util.logging.Logger;
30 
31 /**
32  * HtmlTree represents a parsed and well-formed html text, it provides
33  * methods to convert to plain text. It also provides methods to find
34  * well-formed blocks of text, for quote detection.
35  *
36  * We don't really build a html tree data structure. Instead, for
37  * efficiency, and for the ability to do a simple in-order-traversal
38  * of the tree, we simply keeps a linear list of nodes (in-order).
39  * The begin_ and end_ arrays keeps track of the starting end ending
40  * nodes:
41  *
42  * For a string node, begin_[node] = end_[node] = node
43  * For an open tag, begin_[node] = node, end_[node] = the matching end tag
44  * For a close tag, end_[node] = the matching open tag, end_[node] = node
45  *
46  * @author jlim@google.com (Jing Yee Lim)
47  */
48 public class HtmlTree {
49 
50   /**
51    * An interface that allows clients to provide their own implementation
52    * of a {@link PlainTextConverter}.
53    */
54   public static interface PlainTextConverterFactory {
55     /**
56      * Creates a new instance of a {@link PlainTextConverter} to convert
57      * the contents of an {@link HtmlTree} to plain text.
58      */
createInstance()59     PlainTextConverter createInstance();
60   }
61 
62   /**
63    * An interface for an object which converts a single HtmlTree into
64    * plaintext.
65    */
66   public static interface PlainTextConverter {
67     /**
68      * Adds the given node {@code n} to plain text.
69      *
70      * @param n The node to convert to text.
71      * @param nodeNum The number of the node among the list of all notes.
72      * @param endNum The number of the ending node if this is a start node,
73      *    otherwise the same as {@code nodeNum}.
74      */
addNode(HtmlDocument.Node n, int nodeNum, int endNum)75     void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
76 
77     /**
78      * Returns the current length of the plain text.
79      */
getPlainTextLength()80     int getPlainTextLength();
81 
82     /**
83      * Returns the current plain text.
84      */
getPlainText()85     String getPlainText();
86   }
87 
88   /** A factory that produces converters of the default implementation. */
89   private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY =
90       new PlainTextConverterFactory() {
91         public PlainTextConverter createInstance() {
92           return new DefaultPlainTextConverter();
93         }
94       };
95 
96   /** Contains html nodes */
97   private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
98 
99   /** Keeps track of beginning and end of each node */
100   private final Stack<Integer> begins = new Stack<Integer>();
101   private final Stack<Integer> ends = new Stack<Integer>();
102 
103   /** Plain text (lazy creation) */
104   private String plainText;
105 
106   /** The html string (lazy creation) */
107   private String html;
108 
109   /** textPositions[node pos] = the text position */
110   private int[] textPositions;
111 
112   private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
113 
114   // For debugging only
115   private static final boolean DEBUG = false;
116 
117   private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
118 
119   //------------------------------------------------------------------------
120 
121   /** HtmlTree can only be constructed from this package */
HtmlTree()122   HtmlTree() {
123   }
124 
125   /**
126    * Sets a new {@link PlainTextConverterFactory} to be used to convert
127    * the contents of this tree to plaintext.
128    */
setPlainTextConverterFactory(PlainTextConverterFactory factory)129   public void setPlainTextConverterFactory(PlainTextConverterFactory factory) {
130     if (factory == null) {
131       throw new NullPointerException("factory must not be null");
132     }
133     converterFactory = factory;
134   }
135 
136   /**
137    * Gets the list of node objects. A node can be either a
138    * Tag, EngTag or a String object.
139    * @return the nodes of the tree
140    */
getNodesList()141   public List<HtmlDocument.Node> getNodesList() {
142     return Collections.unmodifiableList(nodes);
143   }
144 
145   /**
146    * @return number of nodes
147    */
getNumNodes()148   public int getNumNodes() {
149     return nodes.size();
150   }
151 
152   /**
153    * Gets the entire html.
154    */
getHtml()155   public String getHtml() {
156     return getHtml(-1);
157   }
158 
159   /**
160    * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
161    * specified size.
162    */
getHtml(int wrapSize)163   public String getHtml(int wrapSize) {
164     if (html == null) {
165       html = getHtml(0, nodes.size(), wrapSize);
166     }
167     return html;
168   }
169 
170   /** Gets parts of the html */
getHtml(int fromNode, int toNode)171   public String getHtml(int fromNode, int toNode) {
172     return getHtml(fromNode, toNode, -1);
173   }
174 
175   /**
176    * Gets parts of the html, if wrapSize is > 0, it tries
177    * to do wrapping at the specified size.
178    */
getHtml(int fromNode, int toNode, int wrapSize)179   public String getHtml(int fromNode, int toNode, int wrapSize) {
180     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
181 
182     int estSize = (toNode - fromNode) * 10;
183     StringBuilder sb = new StringBuilder(estSize);
184     int lastWrapIndex = 0;      // used for wrapping
185     for (int n = fromNode; n < toNode; n++) {
186       HtmlDocument.Node node = nodes.get(n);
187       node.toHTML(sb);
188       // TODO: maybe we can be smarter about this and not add newlines
189       // within <pre> tags, unless the whole long line is encompassed
190       // by the <pre> tag.
191       if (wrapSize > 0) {
192         // We can only wrap if the last outputted node is an element that
193         // breaks the flow. Otherwise, we risk the possibility of inserting
194         // spaces where they shouldn't be.
195         if ((node instanceof HtmlDocument.Tag &&
196               ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
197             (node instanceof HtmlDocument.EndTag &&
198               ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
199           // Check to see if there is a newline in the most recent node's html.
200           int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
201           if (recentNewLine != -1) {
202             lastWrapIndex += recentNewLine;
203           }
204           // If the last index - last index of a newline is greater than
205           // wrapSize, add a newline.
206           if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
207             sb.append('\n');
208             lastWrapIndex = sb.length() - 1;
209           }
210         }
211       }
212     }
213 
214     return sb.toString();
215   }
216 
217   /**
218    * Convert a html region into chunks of html code, each containing
219    * roughly chunkSize characters.
220    */
getHtmlChunks(int fromNode, int toNode, int chunkSize)221   public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
222     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
223 
224     ArrayList<String> chunks = new ArrayList<String>();
225 
226     // Do a best effort attempt to not split apart certain elements (as of now,
227     // just the <textarea>). We cannot guarantee that they will not be split
228     // because the client may specify endpoint nodes that land in the middle
229     // of an element (although this shouldn't happen if the endpoints returned
230     // by createBlocks() are properly used).
231     int stack = 0;
232     boolean balanced = true;
233 
234     StringBuilder sb = new StringBuilder(chunkSize + 256);
235     for (int n = fromNode; n < toNode; n++) {
236       HtmlDocument.Node node = nodes.get(n);
237       node.toHTML(sb);
238 
239       if (node instanceof HtmlDocument.Tag) {
240         if (HTML4.TEXTAREA_ELEMENT.equals(
241             ((HtmlDocument.Tag)node).getElement())) {
242           stack++;
243         }
244       }
245       if (node instanceof HtmlDocument.EndTag) {
246         if (HTML4.TEXTAREA_ELEMENT.equals(
247             ((HtmlDocument.EndTag)node).getElement())) {
248           if (stack == 0) {
249             balanced = false;
250           } else {
251             stack--;
252           }
253         }
254       }
255 
256       if (stack == 0 && sb.length() >= chunkSize) {
257         chunks.add(sb.toString());
258         sb.setLength(0);
259       }
260     }
261 
262     // Don't forget the last chunk!
263     if (sb.length() > 0) {
264       chunks.add(sb.toString());
265     }
266 
267     // If the tree is not balanced (cut off in the middle of a node), log
268     // debug data. Clients should fix their code so that the endpoints from
269     // createBlocks() are properly used.
270     if (!balanced || stack != 0) {
271       StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
272       debug.append(getHtml());
273       debug.append("\nfromNode: ").append(fromNode);
274       debug.append("\ntoNode: ").append(toNode);
275       debug.append("\nNum nodes_: ").append(getNumNodes());
276       for (String chunk : chunks) {
277         debug.append("\nChunk:\n").append(chunk);
278       }
279       logger.severe(debug.toString());
280     }
281 
282     return chunks;
283   }
284 
285   /**
286    * Returns height (maximum length from root to a leaf) of the HTML tree.
287    * @return height of the HTML tree.
288    */
getTreeHeight()289   public int getTreeHeight() {
290     int currentHeight = 0;
291     int maxHeight = 0;
292 
293     for (int i = 0; i < nodes.size(); i++) {
294       HtmlDocument.Node node = nodes.get(i);
295       if (node instanceof HtmlDocument.Tag) {
296         currentHeight++;
297         if (currentHeight > maxHeight) {
298           maxHeight = currentHeight;
299         }
300         if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
301           // Empty tags have no closing pair, so decrease counter here.
302           currentHeight--;
303         }
304       } else if (node instanceof HtmlDocument.EndTag) {
305         currentHeight--;
306       }
307     }
308 
309     // TODO(anatol): make this value cachable?
310     return maxHeight;
311   }
312 
313   //------------------------------------------------------------------------
314   // Creating well-formed blocks within the html tree.
315   //------------------------------------------------------------------------
316   /**
317    * A Block represents a region of a html tree that
318    * 1) is well-formed, i.e. for each node in the block, all its descendants
319    * are also contained in the block. So it's safe to wrap the region
320    * within a <table> or <div>, etc.
321    * 2) starts at the beginning of a "line", e.g. a <div>, a <br>.
322    */
323   public static class Block {
324     /* The starting node */
325     public int start_node;
326 
327     /* The ending node (non-inclusive to the block) */
328     public int end_node;
329   }
330 
331   /**
332    * Creates a list of Blocks, given a text-range.
333    * We may create multiple blocks if one single well-formed Block cannot be
334    * created.
335    *
336    * @param textStart beginning plain-text offset
337    * @param textEnd beginning plain-text offset
338    * @param minNode the smallest node number
339    * @param maxNode the largest node number
340    * @return a list of 0 or more Block objects, never null
341    */
createBlocks(int textStart, int textEnd, int minNode, int maxNode)342   public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
343 
344     ArrayList<Block> blocks = new ArrayList<Block>();
345     int startNode = Math.max(getBlockStart(textStart), minNode);
346     int endNode = Math.min(getBlockEnd(textEnd), maxNode);
347 
348     if (DEBUG) {
349       debug("Creating block: " +
350             "text pos: " + textStart + "-" + textEnd + "\n" +
351             "node pos: " + startNode + "-" + endNode + "\n" +
352             plainText.substring(textStart, textEnd));
353     }
354 
355     // Split up the block [start, end) into one or more blocks that
356     // are well-formed, and begins at a "line" boundary.
357     int blockStart = -1;
358     for (int n = startNode; n < endNode;) {
359 
360       // The node n spans [nBegin, nEnd]
361       int nBegin = begins.get(n);
362       int nEnd = ends.get(n);
363 
364       if (blockStart == -1) {
365         // Check if this is a valid start node
366         if (nBegin >= n && nEnd <= endNode &&
367             canBeginBlockAt(n)) {
368           blockStart = n;
369           n = nEnd + 1;
370         } else {
371           n++;
372         }
373         continue;
374       }
375 
376       // If the node [nBegin, nEnd) lies completely within
377       // the region then proceed to the (nEnd + 1).
378       if (nBegin >= blockStart && nEnd < endNode) {
379         n = nEnd + 1;
380         continue;
381       }
382 
383       // If we got here, we have to break up the region into one
384       // or more blocks because the current node cannot be included
385       // in the region.
386       if (DEBUG) {
387         debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
388               ") exceeds (" + blockStart + " " + endNode + ")");
389       }
390       Block b = new Block();
391       b.start_node = blockStart;
392       b.end_node = n;
393       blocks.add(b);
394 
395       blockStart = -1;
396       n++;
397     }
398 
399     // Last block
400     if (blockStart != -1) {
401       Block b = new Block();
402       b.start_node = blockStart;
403       b.end_node = endNode;
404       blocks.add(b);
405     }
406 
407     if (DEBUG) {
408       for (int i = 0; i < blocks.size(); i++) {
409         Block b = blocks.get(i);
410         debug("Block " + i + "/" + blocks.size() + ": " +
411               b.start_node + "-" + b.end_node + " " +
412               getPlainText(b.start_node, b.end_node));
413       }
414     }
415 
416     return blocks;
417   }
418 
419   /**
420    * Checks if a block can begin starting from a node position
421    */
canBeginBlockAt(int nodePos)422   private boolean canBeginBlockAt(int nodePos) {
423     int textPos = textPositions[nodePos];
424 
425     // Make sure that we don't exceed the text position, this happens
426     // for the last tag nodes.
427     if (textPos == plainText.length()) {
428       textPos--;
429     }
430 
431     // Scan backwards to check if a nodePos is at the beginning
432     // of a line.
433     for (int i = textPos; i > 0; i--) {
434       char ch = plainText.charAt(i);
435       if (ch == '\n') {
436         return true;
437       }
438       if (i < textPos && !Character.isWhitespace(ch)) {
439         return false;
440       }
441     }
442     return true;
443   }
444 
445   /**
446    * Returns the start of a block given a text-pos
447    */
getBlockStart(int textPos)448   private int getBlockStart(int textPos) {
449     int nodenum = Arrays.binarySearch(textPositions, textPos);
450     if (nodenum >= 0) {
451       // Got an exact node alignment. Get the outer most pos that
452       // matches the text position
453       while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
454         nodenum--;
455       }
456     } else {
457       // textPos matches the middle of a node.
458       nodenum = -nodenum - 1;
459     }
460 
461     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
462     return nodenum;
463   }
464 
465   /**
466    * Returns the end of a block given a text-pos
467    */
getBlockEnd(int textPos)468   private int getBlockEnd(int textPos) {
469     int nodenum = Arrays.binarySearch(textPositions, textPos);
470     if (nodenum >= 0) {
471       // Got an exact node alignment.
472       while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
473         nodenum++;
474       }
475     } else {
476       // textPos matches the middle of a node.
477       nodenum = -nodenum - 2;
478     }
479     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
480     return nodenum;
481   }
482 
483   //------------------------------------------------------------------------
484   // Plain text view of the html tree
485   //------------------------------------------------------------------------
486   /**
487    * @return the plain-text position corresponding to the node
488    */
getTextPosition(int node)489   public int getTextPosition(int node) {
490     return textPositions[node];
491   }
492 
493   /**
494    * @return a plain-text String of the html tree
495    */
getPlainText()496   public String getPlainText() {
497     if (plainText == null) {
498       convertToPlainText();
499     }
500     return plainText;
501   }
502 
503   /**
504    * @return a plain-text String of a part of the html tree
505    */
getPlainText(int fromNode, int toNode)506   public String getPlainText(int fromNode, int toNode) {
507     if (plainText == null) {
508       convertToPlainText();
509     }
510     int textstart = textPositions[fromNode];
511     int textend = textPositions[toNode];
512     return plainText.substring(textstart, textend);
513   }
514 
515   /**
516    * Converts the html tree to plain text.
517    * We simply iterate through the nodes in the tree.
518    * As we output the plain-text, we keep track of the text position
519    * of each node.
520    * For String nodes, we replace '\n' with ' ' unless we're in a
521    * <pre> block.
522    */
convertToPlainText()523   private void convertToPlainText() {
524     X.assertTrue(plainText == null && textPositions == null);
525 
526     int numNodes = nodes.size();
527 
528     // Keeps track of start text position of each node, including a last
529     // entry for the size of the text.
530     textPositions = new int[numNodes + 1];
531 
532     PlainTextConverter converter = converterFactory.createInstance();
533 
534     for (int i = 0; i < numNodes; i++) {
535       textPositions[i] = converter.getPlainTextLength();
536       converter.addNode(nodes.get(i), i, ends.get(i));
537     }
538 
539     // Add a last entry, so that textPositions_[nodes_.size()] is valid.
540     textPositions[numNodes] = converter.getPlainTextLength();
541 
542     plainText = converter.getPlainText();
543 
544     if (DEBUG) {
545       debug("Plain text: " + plainText);
546 
547       for (int i = 0; i < nodes.size(); i++) {
548         int textPos = textPositions[i];
549         String text = plainText.substring(textPos, textPositions[i + 1]);
550         debug("At " + i + ": pos=" + textPos + " " +  text);
551       }
552     }
553   }
554 
555   /**
556    * Encapsulates the logic for outputting plain text with respect to text
557    * segments, white space separators, line breaks, and quote marks.
558    */
559   static final class PlainTextPrinter {
560     /**
561      * Separators are whitespace inserted between segments of text. The
562      * semantics are such that between any two segments of text, there is
563      * at most one separator. As such, separators are ordered in increasing
564      * priority, and setting a separator multiple times between text will
565      * result in the single separator with the highest priority being used.
566      * For example, a LineBreak (one newline) will override a Space, but will
567      * be overriden by a BlankLine (two newlines).
568      */
569     static enum Separator {
570       // The values here must be ordered by increasing priority, as the
571       // enum's ordinal() method is used when determining if a new separator
572       // should override an existing one.
573       None,
574       Space,      // single space
575       LineBreak,  // single new line
576       BlankLine   // two new lines
577     }
578 
579     // White space characters that are collapsed as a single space.
580     // Note that characters such as the non-breaking whitespace
581     // and full-width spaces are not equivalent to the normal spaces.
582     private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
583 
584     /**
585      * Determines if the given character is considered an HTML space character.
586      * Consecutive HTML space characters are collapsed into a single space when
587      * not within a PRE element.
588      */
isHtmlWhiteSpace(char ch)589     private static boolean isHtmlWhiteSpace(char ch) {
590       return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
591     }
592 
593     // The buffer in which we accumulate the converted plain text
594     private final StringBuilder sb = new StringBuilder();
595 
596     // How many <blockquote> blocks we are in.
597     private int quoteDepth = 0;
598 
599     // How many logical newlines are at the end of the buffer we've outputted.
600     // Note that we can't simply count the newlines at the end of the output
601     // buffer because a logical new line may be followed by quote marks.
602     //
603     // We initialize the value to 2 so that we consume any initial separators,
604     // since we don't need separators at the beginning of the output. This also
605     // results in correctly outputting any quote marks at the beginning of the
606     // output if the first piece of text is within a BLOCKQUOTE element.
607     private int endingNewLines = 2;
608 
609     // The next separator to be inserted between two text nodes.
610     private Separator separator = Separator.None;
611 
612     /** Returns the current length of the text. */
getTextLength()613     final int getTextLength() {
614       return sb.length();
615     }
616 
617     /** Returns the current text. */
getText()618     final String getText() {
619       return sb.toString();
620     }
621 
622     /**
623      * Sets the next separator between two text nodes. A Space separator is
624      * used if there is any whitespace between the two text nodes when there is
625      * no intervening element that breaks flow. This is automatically handled
626      * by the {@link #appendNormalText} function so the client never needs to
627      * specify this separator.
628      * <p>
629      * A LineBreak separator (single new line) is used if text segments are
630      * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
631      * etc.). The client should set this separator for opening and closing tags
632      * of any element that breaks flow.
633      * <p>
634      * A BlankLine separator (two new lines) should be set for opening and
635      * closing P tags.
636      * <p>
637      * If this method is called multiple times between text nodes, a
638      * separator with a higher priority will override that of a lower priority.
639      */
setSeparator(Separator newSeparator)640     final void setSeparator(Separator newSeparator) {
641       if (newSeparator.ordinal() > separator.ordinal()) {
642         separator = newSeparator;
643       }
644     }
645 
646     /** Increments the current quote depth of the text. */
incQuoteDepth()647     final void incQuoteDepth() {
648       quoteDepth++;
649     }
650 
651     /** Decrements the current quote depth of the text. */
decQuoteDepth()652     final void decQuoteDepth() {
653       quoteDepth = Math.max(0, quoteDepth - 1);
654     }
655 
656     /**
657      * Normalizes the HTML whitespace in the given {@code text} and appends it
658      * as the next segment of text. This will flush any separator that should
659      * be appended before the text, as well as any quote marks that should
660      * follow the last newline if the quote depth is non-zero.
661      */
appendNormalText(String text)662     final void appendNormalText(String text) {
663       if (text.length() == 0) {
664         return;
665       }
666       boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
667       boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
668 
669       // Strip beginning and ending whitespace.
670       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
671 
672       // Collapse whitespace within the text.
673       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
674 
675       if (startsWithSpace) {
676         setSeparator(Separator.Space);
677       }
678 
679       appendTextDirect(text);
680 
681       if (endsWithSpace) {
682         setSeparator(Separator.Space);
683       }
684     }
685 
686     /**
687      * Appends the given text, preserving all whitespace. This is used for
688      * appending text in a PRE element.
689      */
appendPreText(String text)690     final void appendPreText(String text) {
691       // We're in a <pre> block. Split the text into lines, and append
692       // each line with appendTextDirect() to preserve white space.
693       String[] lines = text.split("[\\r\\n]", -1);
694 
695       // split() will always return an array with at least one element.
696       appendTextDirect(lines[0]);
697 
698       // For all of the remaining lines, we append a newline first, which
699       // takes care of any quote marks that we need to output if the quote
700       // depth is non-zero.
701       for (int i = 1; i < lines.length; i++) {
702         appendNewLine();
703         appendTextDirect(lines[i]);
704       }
705     }
706 
707     /**
708      * Appends the {@code text} directly to the output, taking into account
709      * any separator that should be appended before it, and any quote marks
710      * that should follow the last newline if the quote depth is non-zero.
711      * <p>
712      * {@code text} must not contain any new lines--in order to handle
713      * quoting correctly, it is up to the caller to either normalize away the
714      * newlines, or split the text up into separate lines and handle new lines
715      * with the {@link #appendNewLine} method.
716      * <p>
717      * The original {@code text} is not modified in any way. Use this method
718      * when you need to preserve the original white space.
719      * <p>
720      * If the given {@code text} is non empty, this method will result in
721      * {@code endingNewLines} being reset to 0.
722      */
appendTextDirect(String text)723     private void appendTextDirect(String text) {
724       if (text.length() == 0) {
725         return;
726       }
727       Preconditions.checkArgument(text.indexOf('\n') < 0,
728                                   "text must not contain newlines.");
729       flushSeparator();
730       maybeAddQuoteMarks(true);
731       sb.append(text);
732       endingNewLines = 0;
733     }
734 
735     /**
736      * Appends a forced line break, which is the equivalent of a BR element.
737      */
738     final void appendForcedLineBreak() {
739       flushSeparator();
740       appendNewLine();
741     }
742 
743     /**
744      * Appends any pending separator to the output buffer. This should be
745      * called before appending text to the buffer.
746      */
747     private void flushSeparator() {
748       switch (separator) {
749         case Space:
750           if (endingNewLines == 0) {
751             // Only append a space separator if we are not following a new
752             // line character. For example, we don't append a separator
753             // space after a <br> tag, since the <br>'s newline fulfills the
754             // space separation requirement.
755             sb.append(" ");
756           }
757           break;
758         case LineBreak:
759           while (endingNewLines < 1) {
760             appendNewLine();
761           }
762           break;
763         case BlankLine:
764           while (endingNewLines < 2) {
765             appendNewLine();
766           }
767           break;
768       }
769       separator = Separator.None;
770     }
771 
772     /**
773      * Adds a newline to the output. This handles any quote marks that should
774      * follow any previous new lines, and increments {@code endingNewLines}.
775      */
776     private void appendNewLine() {
777       maybeAddQuoteMarks(false);
778       sb.append('\n');
779       endingNewLines++;
780     }
781 
782     /**
783      * Adds quote marks to the output if we are at the beginning of a line.
784      * One '>' character is used for every level of quoting we are in.
785      *
786      * @param includeEndingSpace Includes a single space after the quote marks.
787      */
788     private void maybeAddQuoteMarks(boolean includeEndingSpace) {
789       // We only need to add quote marks if we are at the beginning of line.
790       if (endingNewLines > 0 && quoteDepth > 0) {
791         for (int i = 0; i < quoteDepth; i++) {
792           sb.append('>');
793         }
794         if (includeEndingSpace) {
795           sb.append(' ');
796         }
797       }
798     }
799   }
800 
801   /**
802    * Contains the logic for converting the contents of one HtmlTree into
803    * plaintext.
804    */
805   public static class DefaultPlainTextConverter implements PlainTextConverter {
806 
807     private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
808         ImmutableSet.of(
809             HTML4.P_ELEMENT,
810             HTML4.BLOCKQUOTE_ELEMENT,
811             HTML4.PRE_ELEMENT);
812 
813     private final PlainTextPrinter printer = new PlainTextPrinter();
814 
815     private int preDepth = 0;
816 
817     public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
818       if (n instanceof HtmlDocument.Text) {        // A string node
819 
820         HtmlDocument.Text textNode = (HtmlDocument.Text) n;
821         String str = textNode.getText();
822 
823         if (preDepth > 0) {
824           printer.appendPreText(str);
825 
826         } else {
827           printer.appendNormalText(str);
828         }
829 
830       } else if (n instanceof HtmlDocument.Tag) {
831 
832         // Check for linebreaking tags.
833         HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
834         HTML.Element element = tag.getElement();
835 
836         if (BLANK_LINE_ELEMENTS.contains(element)) {
837           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
838 
839         } else if (HTML4.BR_ELEMENT.equals(element)) {
840           // The <BR> element is special in that it always adds a newline.
841           printer.appendForcedLineBreak();
842 
843         } else if (element.breaksFlow()) {
844           // All other elements that break the flow add a LineBreak separator.
845           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
846 
847           if (HTML4.HR_ELEMENT.equals(element)) {
848             printer.appendNormalText("________________________________");
849             printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
850           }
851         }
852 
853         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
854           printer.incQuoteDepth();
855 
856         } else if (HTML4.PRE_ELEMENT.equals(element)) {
857           preDepth++;
858         }
859 
860       } else if (n instanceof HtmlDocument.EndTag) {
861 
862         // Check for linebreaking tags.
863         HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
864         HTML.Element element = endTag.getElement();
865 
866         if (BLANK_LINE_ELEMENTS.contains(element)) {
867           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
868 
869         } else if (element.breaksFlow()) {
870           // All other elements that break the flow add a LineBreak separator.
871           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
872         }
873 
874         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
875           printer.decQuoteDepth();
876 
877         } else if (HTML4.PRE_ELEMENT.equals(element)) {
878           preDepth--;
879         }
880       }
881     }
882 
883     public final int getPlainTextLength() {
884       return printer.getTextLength();
885     }
886 
887     public final String getPlainText() {
888       return printer.getText();
889     }
890   }
891 
892   //------------------------------------------------------------------------
893   // The following methods are used to build the html tree.
894   //------------------------------------------------------------------------
895   /** For building the html tree */
896   private Stack<Integer> stack;
897   private int parent;
898 
899   /** Starts the build process */
900   void start() {
901     stack = new Stack<Integer>();
902     parent = -1;
903   }
904 
905   /** Finishes the build process */
906   void finish() {
907     X.assertTrue(stack.size() == 0);
908     X.assertTrue(parent == -1);
909   }
910 
911   /**
912    * to add the matching end tag
913    */
914   void addStartTag(HtmlDocument.Tag t) {
915     int nodenum = nodes.size();
916     addNode(t, nodenum, -1);
917 
918     stack.add(parent);
919     parent = nodenum;
920   }
921 
922   /**
923    * Adds a html end tag, this must be preceded by a previous matching open tag
924    */
925   void addEndTag(HtmlDocument.EndTag t) {
926     int nodenum = nodes.size();
927     addNode(t, parent, nodenum);
928 
929     if (parent != -1) {
930       ends.set(parent, nodenum);
931     }
932 
933     //is this the right pop?
934     parent = stack.pop();
935   }
936 
937   /** Adds a singular tag that does not have a corresponding end tag */
938   void addSingularTag(HtmlDocument.Tag t) {
939     int nodenum = nodes.size();
940     addNode(t, nodenum, nodenum);
941   }
942 
943   /**
944    * Adds a text
945    * @param t a plain-text string
946    */
947   void addText(HtmlDocument.Text t) {
948     int nodenum = nodes.size();
949     addNode(t, nodenum, nodenum);
950   }
951 
952   /** Adds a node */
953   private void addNode(HtmlDocument.Node n, int begin, int end) {
954 
955     nodes.add(n);
956     begins.add(begin);
957     ends.add(end);
958   }
959 
960   /** For debugging */
961   private static final void debug(String str) {
962     logger.finest(str);
963   }
964 
965 }