1 /** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.android.mail.common.html.parser; 17 18 import com.google.android.mail.common.base.X; 19 import com.google.android.mail.common.html.parser.HtmlDocument.EndTag; 20 import com.google.common.io.ByteStreams; 21 22 import java.io.IOException; 23 import java.util.ArrayList; 24 import java.util.List; 25 import java.util.logging.Level; 26 import java.util.logging.Logger; 27 28 /** 29 * HtmlTreeBuilder builds a well-formed HtmlTree. 30 * 31 * @see HtmlTree 32 * @author jlim@google.com (Jing Yee Lim) 33 */ 34 public class HtmlTreeBuilder implements HtmlDocument.Visitor { 35 36 private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName()); 37 38 /** Stack contains HTML4.Element objects to keep track of unclosed tags */ 39 private final List<HTML.Element> stack = new ArrayList<HTML.Element>(); 40 private final TableFixer tableFixer = new TableFixer(); 41 private HtmlTree tree; 42 private boolean built = false; 43 44 /** Gets the built html tree */ getTree()45 public HtmlTree getTree() { 46 X.assertTrue(built); 47 return tree; 48 } 49 50 /** Implements HtmlDocument.Visitor.start */ start()51 public void start() { 52 tree = new HtmlTree(); 53 tree.start(); 54 } 55 56 /** Implements HtmlDocument.Visitor.finish */ finish()57 public void finish() { 58 // Close all tags 59 while (stack.size() > 0) { 60 addMissingEndTag(); 61 } 62 tableFixer.finish(); 63 tree.finish(); 64 65 built = true; 66 } 67 68 /** Implements HtmlDocument.Visitor.visitTag */ visitTag(HtmlDocument.Tag t)69 public void visitTag(HtmlDocument.Tag t) { 70 tableFixer.seeTag(t); 71 72 HTML.Element element = t.getElement(); 73 if (element.isEmpty()) { 74 tree.addSingularTag(t); 75 } else if (t.isSelfTerminating()) { 76 // Explicitly create a non-selfterminating open tag and add it to the tree 77 // and also immediately add the corresponding close tag. This is done 78 // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list 79 // will be balanced consistently. 80 // Otherwise there is a possibility of "<span /></span>" for example, if 81 // the created tree is converted to string through toXHTML. 82 tree.addStartTag(HtmlDocument.createTag(element, 83 t.getAttributes(), t.getOriginalHtmlBeforeAttributes(), 84 t.getOriginalHtmlAfterAttributes())); 85 EndTag end = HtmlDocument.createEndTag(element); 86 tableFixer.seeEndTag(end); 87 tree.addEndTag(end); 88 } else { 89 tree.addStartTag(t); 90 push(element); // Track the open tags 91 } 92 } 93 94 /** Implements HtmlVisitor.visit */ visitEndTag(HtmlDocument.EndTag t)95 public void visitEndTag(HtmlDocument.EndTag t) { 96 97 // Here we pop back to the start tag 98 HTML.Element element = t.getElement(); 99 int pos = findStartTag(element); 100 if (pos >= 0) { 101 102 // Add missing end-tags if any 103 while (pos < stack.size() - 1) { 104 addMissingEndTag(); 105 } 106 107 pop(); 108 tableFixer.seeEndTag(t); 109 tree.addEndTag(t); 110 111 } else { 112 // Not found, ignore this end tag 113 logger.finest("Ignoring end tag: " + element.getName()); 114 } 115 } 116 117 /** Implements HtmlDocument.Visitor.visitText */ visitText(HtmlDocument.Text t)118 public void visitText(HtmlDocument.Text t) { 119 tableFixer.seeText(t); 120 tree.addText(t); 121 } 122 123 /** Implements HtmlDocument.Visitor.visitComment */ visitComment(HtmlDocument.Comment n)124 public void visitComment(HtmlDocument.Comment n) { 125 // ignore 126 } 127 128 /** Finds the start tag from the stack, returns -1 if not found */ findStartTag(HTML.Element element)129 private int findStartTag(HTML.Element element) { 130 for (int i = stack.size() - 1; i >= 0; i--) { 131 HTML.Element e = stack.get(i); 132 if (e == element) { 133 return i; 134 } 135 } 136 return -1; 137 } 138 139 /** 140 * Adds a close tag corresponding to a tag on the stack, if 141 * the tag needs a close tag. 142 */ addMissingEndTag()143 private void addMissingEndTag() { 144 HTML.Element element = pop(); 145 146 HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element); 147 tableFixer.seeEndTag(endTag); 148 tree.addEndTag(endTag); 149 } 150 151 /** Pushes a tag onto the stack */ push(HTML.Element element)152 private void push(HTML.Element element) { 153 stack.add(element); 154 } 155 156 /** Pops an elemnt from the stack */ pop()157 private HTML.Element pop() { 158 return stack.remove(stack.size() - 1); 159 } 160 161 /** 162 * The TableFixer makes sure that a <table> structure is more or less well 163 * formed. Note that it only ensures that data within the <table> tag doesn't 164 * "leak out" of the table. 165 * 166 * For instance, all the tags here are balanced with end tags. But the 167 * 'outside' text ends up leaking out of the table. 168 * <table><tr><td bgcolor=yellow> 169 * <table><table>inside</table><td>outside</td></table> 170 * </td></tr></table> 171 * 172 * The TableFixer makes sure that 173 * 1) Within a table:, text and other elements are enclosed within a TD. 174 * A TD tag is inserted where necessary. 175 * 2) All table structure tags are enclosed within a <table>. A TABLE tag 176 * is inserted where necessary. 177 * 178 * Note that the TableFixer only adds open tags, it doesn't add end tags. 179 * The HtmlTreeVerifier ensures that all open tags are properly matched 180 * up and closed. 181 * 182 * @author Jing Yee Lim (jlim@google.com) 183 */ 184 class TableFixer { 185 186 private int tables = 0; // table nesting level 187 188 // States within a <table> 189 static final int NULL = 0; 190 static final int IN_CELL = 1; // in a <td> or <th> tag 191 static final int IN_CAPTION = 2; // in a <caption> tag 192 193 private int state; 194 seeTag(HtmlDocument.Tag tag)195 void seeTag(HtmlDocument.Tag tag) { 196 HTML.Element element = tag.getElement(); 197 if (element.getType() == HTML.Element.TABLE_TYPE) { 198 199 if (HTML4.TABLE_ELEMENT.equals(element)) { 200 if (tables > 0) { 201 ensureCellState(); 202 } 203 tables++; 204 state = NULL; 205 206 } else { 207 // Make sure that we're in a table 208 ensureTableState(); 209 210 // In cell/caption? 211 if (HTML4.TD_ELEMENT.equals(element) || 212 HTML4.TH_ELEMENT.equals(element)) { 213 state = IN_CELL; 214 215 } else if (HTML4.CAPTION_ELEMENT.equals(element)) { 216 state = IN_CAPTION; 217 } 218 } 219 } else { 220 if (tables > 0) { 221 222 // Ok to have a form element outside a table cell. 223 // e.g. <TR><FORM><TD>... 224 if (!HTML4.FORM_ELEMENT.equals(element)) { 225 ensureCellState(); 226 } 227 } 228 } 229 } 230 seeEndTag(HtmlDocument.EndTag endTag)231 void seeEndTag(HtmlDocument.EndTag endTag) { 232 HTML.Element element= endTag.getElement(); 233 234 if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) { 235 236 if (HTML4.TD_ELEMENT.equals(element) || 237 HTML4.TR_ELEMENT.equals(element) || 238 HTML4.TH_ELEMENT.equals(element)) { 239 // End of a cell 240 state = NULL; 241 242 } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption 243 state = NULL; 244 245 } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table 246 X.assertTrue(tables > 0); 247 tables--; 248 state = (tables > 0) ? IN_CELL : NULL; 249 } 250 } 251 } 252 seeText(HtmlDocument.Text textNode)253 void seeText(HtmlDocument.Text textNode) { 254 // If we're in a table, but not in a cell or caption, and the 255 // text is not whitespace, add a <TD> 256 if (tables > 0 && 257 state == NULL && 258 !textNode.isWhitespace()) { 259 ensureCellState(); 260 } 261 } 262 finish()263 void finish() { 264 X.assertTrue(tables == 0); 265 X.assertTrue(state == NULL); 266 } 267 268 // Ensure that we're within a TABLE ensureTableState()269 private void ensureTableState() { 270 if (tables == 0) { 271 push(HTML4.TABLE_ELEMENT); 272 273 HtmlDocument.Tag tableTag = 274 HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null); 275 tree.addStartTag(tableTag); 276 277 tables++; 278 } 279 } 280 281 // Ensure that we're within a TD or TH cell ensureCellState()282 private void ensureCellState() { 283 if (state != IN_CELL) { 284 push(HTML4.TD_ELEMENT); 285 286 HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null); 287 tree.addStartTag(tdTag); 288 289 state = IN_CELL; 290 } 291 } 292 } 293 294 /** For testing */ main(String[] args)295 public static void main(String[] args) throws IOException { 296 logger.setLevel(Level.FINEST); 297 298 String html = new String(ByteStreams.toByteArray(System.in)); 299 HtmlParser parser = new HtmlParser(); 300 HtmlDocument doc = parser.parse(html); 301 302 HtmlTreeBuilder builder = new HtmlTreeBuilder(); 303 doc.accept(builder); 304 String outputHtml = builder.getTree().getHtml(); 305 306 System.out.println(outputHtml); 307 } 308 }