1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.harmony.xml.parsers; 18 19 import java.io.IOException; 20 import java.util.StringTokenizer; 21 22 import javax.xml.parsers.DocumentBuilder; 23 24 import org.kxml2.io.KXmlParser; 25 import org.w3c.dom.Attr; 26 import org.w3c.dom.DOMImplementation; 27 import org.w3c.dom.Document; 28 import org.w3c.dom.Element; 29 import org.w3c.dom.Node; 30 import org.xml.sax.EntityResolver; 31 import org.xml.sax.ErrorHandler; 32 import org.xml.sax.InputSource; 33 import org.xml.sax.SAXException; 34 import org.xml.sax.SAXParseException; 35 import org.xml.sax.helpers.LocatorImpl; 36 import org.xmlpull.v1.XmlPullParser; 37 import org.xmlpull.v1.XmlPullParserException; 38 39 import org.apache.harmony.xml.dom.DOMImplementationImpl; 40 41 /** 42 * Provides a straightforward DocumentBuilder implementation based on 43 * XMLPull/KXML. The class is used internally only, thus only notable members 44 * that are not already in the abstract superclass are documented. Hope that's 45 * ok. 46 */ 47 class DocumentBuilderImpl extends DocumentBuilder { 48 49 private static DOMImplementation dom = DOMImplementationImpl.getInstance(); 50 51 private EntityResolver entityResolver; 52 53 private ErrorHandler errorHandler; 54 55 private boolean ignoreComments; 56 57 private boolean ignoreElementContentWhitespace; 58 59 private boolean namespaceAware; 60 DocumentBuilderImpl()61 DocumentBuilderImpl() { 62 // Do nothing. 63 } 64 65 @Override getDOMImplementation()66 public DOMImplementation getDOMImplementation() { 67 return dom; 68 } 69 70 /** 71 * Reflects whether this DocumentBuilder is configured to ignore comments. 72 * 73 * @return True if and only if comments are ignored. 74 */ isIgnoringComments()75 public boolean isIgnoringComments() { 76 return ignoreComments; 77 } 78 79 /** 80 * Reflects whether this DocumentBuilder is configured to ignore element 81 * content whitespace. 82 * 83 * @return True if and only if whitespace element content is ignored. 84 */ isIgnoringElementContentWhitespace()85 public boolean isIgnoringElementContentWhitespace() { 86 return ignoreElementContentWhitespace; 87 } 88 89 @Override isNamespaceAware()90 public boolean isNamespaceAware() { 91 return namespaceAware; 92 } 93 94 @Override isValidating()95 public boolean isValidating() { 96 return false; 97 } 98 99 @Override newDocument()100 public Document newDocument() { 101 return dom.createDocument(null, null, null); 102 } 103 104 @Override parse(InputSource source)105 public Document parse(InputSource source) throws SAXException, IOException { 106 if (source == null) { 107 throw new IllegalArgumentException(); 108 } 109 110 Document document = newDocument(); 111 112 try { 113 XmlPullParser parser = new KXmlParser(); 114 115 parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, 116 namespaceAware); 117 118 if (source.getByteStream() != null) { 119 parser.setInput(source.getByteStream(), source.getEncoding()); 120 } else if (source.getCharacterStream() != null) { 121 parser.setInput(source.getCharacterStream()); 122 } else { 123 // TODO Accept other sources as well? 124 throw new SAXParseException( 125 "InputSource needs either stream or reader", null); 126 } 127 128 if(parser.nextToken() == XmlPullParser.END_DOCUMENT) { 129 throw new SAXParseException( 130 "Unexpected end of document", null); 131 } 132 133 parse(parser, document, document, XmlPullParser.END_DOCUMENT); 134 135 parser.require(XmlPullParser.END_DOCUMENT, null, null); 136 } catch (XmlPullParserException ex) { 137 if(ex.getDetail() instanceof IOException) { 138 throw (IOException)ex.getDetail(); 139 } 140 if(ex.getDetail() instanceof RuntimeException) { 141 throw (RuntimeException)ex.getDetail(); 142 } 143 144 LocatorImpl locator = new LocatorImpl(); 145 146 locator.setPublicId(source.getPublicId()); 147 locator.setSystemId(source.getSystemId()); 148 locator.setLineNumber(ex.getLineNumber()); 149 locator.setColumnNumber(ex.getColumnNumber()); 150 151 SAXParseException newEx = new SAXParseException(ex.getMessage(), 152 locator); 153 154 if (errorHandler != null) { 155 errorHandler.error(newEx); 156 } 157 158 throw newEx; 159 } 160 161 return document; 162 } 163 164 /** 165 * Implements the whole parsing of the XML document. The XML pull parser is 166 * actually more of a tokenizer, and we are doing a classical recursive 167 * descent parsing (the method invokes itself for XML elements). Our 168 * approach to parsing does accept some illegal documents (more than one 169 * root element, for example). The assumption is that the DOM implementation 170 * throws the proper exceptions in these cases. 171 * 172 * @param parser The XML pull parser we're reading from. 173 * @param document The document we're building. 174 * @param node The node we're currently on (initially the document itself). 175 * @param endToken The token that will end this recursive call. Either 176 * XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG. 177 * 178 * @throws XmlPullParserException If a parsing error occurs. 179 * @throws IOException If a general IO error occurs. 180 */ parse(XmlPullParser parser, Document document, Node node, int endToken)181 private void parse(XmlPullParser parser, Document document, Node node, 182 int endToken) throws XmlPullParserException, IOException { 183 184 int token = parser.getEventType(); 185 186 /* 187 * The main parsing loop. The precondition is that we are already on the 188 * token to be processed. This holds for each iteration of the loop, so 189 * the inner statements have to ensure that (in particular the recursive 190 * call). 191 */ 192 while (token != endToken && token != XmlPullParser.END_DOCUMENT) { 193 if (token == XmlPullParser.PROCESSING_INSTRUCTION) { 194 /* 195 * Found a processing instructions. We need to split the token 196 * text at the first whitespace character. 197 */ 198 String text = parser.getText(); 199 200 int dot = text.indexOf(' '); 201 202 String target = (dot != -1 ? text.substring(0, dot) : text); 203 String data = (dot != -1 ? text.substring(dot + 1) : ""); 204 205 node.appendChild(document.createProcessingInstruction(target, 206 data)); 207 } else if (token == XmlPullParser.DOCDECL) { 208 /* 209 * Found a document type declaration. Unfortunately KXML doesn't 210 * have the necessary details. Do we parse it ourselves, or do 211 * we silently ignore it, since it isn't mandatory in DOM 2 212 * anyway? 213 */ 214 StringTokenizer tokenizer = new StringTokenizer(parser.getText()); 215 if (tokenizer.hasMoreTokens()) { 216 String name = tokenizer.nextToken(); 217 String pubid = null; 218 String sysid = null; 219 220 if (tokenizer.hasMoreTokens()) { 221 String text = tokenizer.nextToken(); 222 223 if ("SYSTEM".equals(text)) { 224 if (tokenizer.hasMoreTokens()) { 225 sysid = tokenizer.nextToken(); 226 } 227 } else if ("PUBLIC".equals(text)) { 228 if (tokenizer.hasMoreTokens()) { 229 pubid = tokenizer.nextToken(); 230 } 231 if (tokenizer.hasMoreTokens()) { 232 sysid = tokenizer.nextToken(); 233 } 234 } 235 } 236 237 if (pubid != null && pubid.length() >= 2 && pubid.startsWith("\"") && pubid.endsWith("\"")) { 238 pubid = pubid.substring(1, pubid.length() - 1); 239 } 240 241 if (sysid != null && sysid.length() >= 2 && sysid.startsWith("\"") && sysid.endsWith("\"")) { 242 sysid = sysid.substring(1, sysid.length() - 1); 243 } 244 245 document.appendChild(dom.createDocumentType(name, pubid, sysid)); 246 } 247 248 } else if (token == XmlPullParser.COMMENT) { 249 /* 250 * Found a comment. We simply take the token text, but we only 251 * create a node if the client wants to see comments at all. 252 */ 253 if (!ignoreComments) { 254 node.appendChild(document.createComment(parser.getText())); 255 } 256 } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) { 257 /* 258 * Found some ignorable whitespace. We simply take the token 259 * text, but we only create a node if the client wants to see 260 * whitespace at all. 261 */ 262 if (!ignoreElementContentWhitespace) { 263 node.appendChild(document.createTextNode(parser.getText())); 264 } 265 } else if (token == XmlPullParser.TEXT) { 266 /* 267 * Found a piece of text. That's the easiest case. We simply 268 * take it and create a corresponding node. 269 */ 270 node.appendChild(document.createTextNode(parser.getText())); 271 } else if (token == XmlPullParser.CDSECT) { 272 /* 273 * Found a CDATA section. That's also trivial. We simply 274 * take it and create a corresponding node. 275 */ 276 node.appendChild(document.createCDATASection(parser.getText())); 277 } else if (token == XmlPullParser.ENTITY_REF) { 278 /* 279 * Found an entity reference. If an entity resolver is 280 * installed, we replace it by text (if possible). Otherwise we 281 * add an entity reference node. 282 */ 283 String entity = parser.getName(); 284 285 if (entityResolver != null) { 286 // TODO Implement this... 287 } 288 289 String replacement = resolveStandardEntity(entity); 290 if (replacement != null) { 291 node.appendChild(document.createTextNode(replacement)); 292 } else { 293 node.appendChild(document.createEntityReference(entity)); 294 } 295 } else if (token == XmlPullParser.START_TAG) { 296 /* 297 * Found an element start tag. We create an element node with 298 * the proper info and attributes. We then invoke parse() 299 * recursively to handle the next level of nesting. When we 300 * return from this call, we check that we are on the proper 301 * element end tag. The whole handling differs somewhat 302 * depending on whether the parser is namespace-aware or not. 303 */ 304 if (namespaceAware) { 305 // Collect info for element node 306 String namespace = parser.getNamespace(); 307 String name = parser.getName(); 308 String prefix = parser.getPrefix(); 309 310 if ("".equals(namespace)) { 311 namespace = null; 312 } 313 314 // Create element node and wire it correctly 315 Element element = document.createElementNS(namespace, name); 316 element.setPrefix(prefix); 317 node.appendChild(element); 318 319 for (int i = 0; i < parser.getAttributeCount(); i++) { 320 // Collect info for a single attribute node 321 String attrNamespace = parser.getAttributeNamespace(i); 322 String attrPrefix = parser.getAttributePrefix(i); 323 String attrName = parser.getAttributeName(i); 324 String attrValue = parser.getAttributeValue(i); 325 326 if ("".equals(attrNamespace)) { 327 attrNamespace = null; 328 } 329 330 // Create attribute node and wire it correctly 331 Attr attr = document.createAttributeNS(attrNamespace, attrName); 332 attr.setPrefix(attrPrefix); 333 attr.setValue(attrValue); 334 element.setAttributeNodeNS(attr); 335 } 336 337 // Recursive descent 338 token = parser.nextToken(); 339 parse(parser, document, element, XmlPullParser.END_TAG); 340 341 // Expect the element's end tag here 342 parser.require(XmlPullParser.END_TAG, namespace, name); 343 344 } else { 345 // Collect info for element node 346 String name = parser.getName(); 347 348 // Create element node and wire it correctly 349 Element element = document.createElement(name); 350 node.appendChild(element); 351 352 for (int i = 0; i < parser.getAttributeCount(); i++) { 353 // Collect info for a single attribute node 354 String attrName = parser.getAttributeName(i); 355 String attrValue = parser.getAttributeValue(i); 356 357 // Create attribute node and wire it correctly 358 Attr attr = document.createAttribute(attrName); 359 attr.setValue(attrValue); 360 element.setAttributeNode(attr); 361 } 362 363 // Recursive descent 364 token = parser.nextToken(); 365 parse(parser, document, element, XmlPullParser.END_TAG); 366 367 // Expect the element's end tag here 368 parser.require(XmlPullParser.END_TAG, "", name); 369 } 370 } 371 372 token = parser.nextToken(); 373 } 374 } 375 376 @Override setEntityResolver(EntityResolver resolver)377 public void setEntityResolver(EntityResolver resolver) { 378 entityResolver = resolver; 379 } 380 381 @Override setErrorHandler(ErrorHandler handler)382 public void setErrorHandler(ErrorHandler handler) { 383 errorHandler = handler; 384 } 385 386 /** 387 * Controls whether this DocumentBuilder ignores comments. 388 * 389 * @param value Turns comment ignorance on or off. 390 */ setIgnoreComments(boolean value)391 public void setIgnoreComments(boolean value) { 392 ignoreComments = value; 393 } 394 395 /** 396 * Controls whether this DocumentBuilder ignores element content whitespace. 397 * 398 * @param value Turns element whitespace content ignorance on or off. 399 */ setIgnoreElementContentWhitespace(boolean value)400 public void setIgnoreElementContentWhitespace(boolean value) { 401 ignoreElementContentWhitespace = value; 402 } 403 404 /** 405 * Controls whether this DocumentBuilder is namespace-aware. 406 * 407 * @param value Turns namespace awareness on or off. 408 */ setNamespaceAware(boolean value)409 public void setNamespaceAware(boolean value) { 410 namespaceAware = value; 411 } 412 413 /** 414 * Resolves one of the five standard XML entities. 415 * 416 * @param entity The name of the entity to resolve, not including 417 * the ampersand or the semicolon. 418 * 419 * @return The proper replacement, or null, if the entity is unknown. 420 */ resolveStandardEntity(String entity)421 private String resolveStandardEntity(String entity) { 422 if ("lt".equals(entity)) { 423 return "<"; 424 } else if ("gt".equals(entity)) { 425 return ">"; 426 } else if ("amp".equals(entity)) { 427 return "&"; 428 } else if ("apos".equals(entity)) { 429 return "'"; 430 } else if ("quot".equals(entity)) { 431 return "\""; 432 } else { 433 return null; 434 } 435 } 436 } 437