1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.harmony.xml.parsers; 18 19 import java.io.IOException; 20 import java.net.URL; 21 import java.net.URLConnection; 22 import javax.xml.parsers.DocumentBuilder; 23 import libcore.io.IoUtils; 24 import org.apache.harmony.xml.dom.CDATASectionImpl; 25 import org.apache.harmony.xml.dom.DOMImplementationImpl; 26 import org.apache.harmony.xml.dom.DocumentImpl; 27 import org.apache.harmony.xml.dom.DocumentTypeImpl; 28 import org.apache.harmony.xml.dom.TextImpl; 29 import org.kxml2.io.KXmlParser; 30 import org.w3c.dom.Attr; 31 import org.w3c.dom.DOMImplementation; 32 import org.w3c.dom.Document; 33 import org.w3c.dom.DocumentType; 34 import org.w3c.dom.Element; 35 import org.w3c.dom.Node; 36 import org.w3c.dom.Text; 37 import org.xml.sax.EntityResolver; 38 import org.xml.sax.ErrorHandler; 39 import org.xml.sax.InputSource; 40 import org.xml.sax.SAXException; 41 import org.xml.sax.SAXParseException; 42 import org.xml.sax.helpers.LocatorImpl; 43 import org.xmlpull.v1.XmlPullParser; 44 import org.xmlpull.v1.XmlPullParserException; 45 46 /** 47 * Builds a DOM using KXmlParser. 48 */ 49 class DocumentBuilderImpl extends DocumentBuilder { 50 51 private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance(); 52 53 private boolean coalescing; 54 private EntityResolver entityResolver; 55 private ErrorHandler errorHandler; 56 private boolean ignoreComments; 57 private boolean ignoreElementContentWhitespace; 58 private boolean namespaceAware; 59 // adding a new field? don't forget to update reset(). 60 reset()61 @Override public void reset() { 62 coalescing = false; 63 entityResolver = null; 64 errorHandler = null; 65 ignoreComments = false; 66 ignoreElementContentWhitespace = false; 67 namespaceAware = false; 68 } 69 70 @Override getDOMImplementation()71 public DOMImplementation getDOMImplementation() { 72 return dom; 73 } 74 75 @Override isNamespaceAware()76 public boolean isNamespaceAware() { 77 return namespaceAware; 78 } 79 80 @Override isValidating()81 public boolean isValidating() { 82 return false; 83 } 84 85 @Override newDocument()86 public Document newDocument() { 87 return dom.createDocument(null, null, null); 88 } 89 90 @Override parse(InputSource source)91 public Document parse(InputSource source) throws SAXException, IOException { 92 if (source == null) { 93 throw new IllegalArgumentException("source == null"); 94 } 95 96 String namespaceURI = null; 97 String qualifiedName = null; 98 DocumentType doctype = null; 99 String inputEncoding = source.getEncoding(); 100 String systemId = source.getSystemId(); 101 DocumentImpl document = new DocumentImpl( 102 dom, namespaceURI, qualifiedName, doctype, inputEncoding); 103 document.setDocumentURI(systemId); 104 105 KXmlParser parser = new KXmlParser(); 106 try { 107 parser.keepNamespaceAttributes(); 108 parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware); 109 110 if (source.getByteStream() != null) { 111 parser.setInput(source.getByteStream(), inputEncoding); 112 } else if (source.getCharacterStream() != null) { 113 parser.setInput(source.getCharacterStream()); 114 } else if (systemId != null) { 115 URL url = new URL(systemId); 116 URLConnection urlConnection = url.openConnection(); 117 urlConnection.connect(); 118 // TODO: if null, extract the inputEncoding from the Content-Type header? 119 parser.setInput(urlConnection.getInputStream(), inputEncoding); 120 } else { 121 throw new SAXParseException("InputSource needs a stream, reader or URI", null); 122 } 123 124 if (parser.nextToken() == XmlPullParser.END_DOCUMENT) { 125 throw new SAXParseException("Unexpected end of document", null); 126 } 127 128 parse(parser, document, document, XmlPullParser.END_DOCUMENT); 129 130 parser.require(XmlPullParser.END_DOCUMENT, null, null); 131 } catch (XmlPullParserException ex) { 132 if (ex.getDetail() instanceof IOException) { 133 throw (IOException) ex.getDetail(); 134 } 135 if (ex.getDetail() instanceof RuntimeException) { 136 throw (RuntimeException) ex.getDetail(); 137 } 138 139 LocatorImpl locator = new LocatorImpl(); 140 141 locator.setPublicId(source.getPublicId()); 142 locator.setSystemId(systemId); 143 locator.setLineNumber(ex.getLineNumber()); 144 locator.setColumnNumber(ex.getColumnNumber()); 145 146 SAXParseException newEx = new SAXParseException(ex.getMessage(), locator); 147 148 if (errorHandler != null) { 149 errorHandler.error(newEx); 150 } 151 152 throw newEx; 153 } finally { 154 IoUtils.closeQuietly(parser); 155 } 156 157 return document; 158 } 159 160 /** 161 * Implements the whole parsing of the XML document. The XML pull parser is 162 * actually more of a tokenizer, and we are doing a classical recursive 163 * descent parsing (the method invokes itself for XML elements). Our 164 * approach to parsing does accept some illegal documents (more than one 165 * root element, for example). The assumption is that the DOM implementation 166 * throws the proper exceptions in these cases. 167 * 168 * @param parser The XML pull parser we're reading from. 169 * @param document The document we're building. 170 * @param node The node we're currently on (initially the document itself). 171 * @param endToken The token that will end this recursive call. Either 172 * XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG. 173 * 174 * @throws XmlPullParserException If a parsing error occurs. 175 * @throws IOException If a general IO error occurs. 176 */ parse(KXmlParser parser, DocumentImpl document, Node node, int endToken)177 private void parse(KXmlParser parser, DocumentImpl document, Node node, 178 int endToken) throws XmlPullParserException, IOException { 179 180 int token = parser.getEventType(); 181 182 /* 183 * The main parsing loop. The precondition is that we are already on the 184 * token to be processed. This holds for each iteration of the loop, so 185 * the inner statements have to ensure that (in particular the recursive 186 * call). 187 */ 188 while (token != endToken && token != XmlPullParser.END_DOCUMENT) { 189 if (token == XmlPullParser.PROCESSING_INSTRUCTION) { 190 /* 191 * Found a processing instructions. We need to split the token 192 * text at the first whitespace character. 193 */ 194 String text = parser.getText(); 195 196 int dot = text.indexOf(' '); 197 198 String target = (dot != -1 ? text.substring(0, dot) : text); 199 String data = (dot != -1 ? text.substring(dot + 1) : ""); 200 201 node.appendChild(document.createProcessingInstruction(target, 202 data)); 203 } else if (token == XmlPullParser.DOCDECL) { 204 String name = parser.getRootElementName(); 205 String publicId = parser.getPublicId(); 206 String systemId = parser.getSystemId(); 207 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId)); 208 209 } else if (token == XmlPullParser.COMMENT) { 210 /* 211 * Found a comment. We simply take the token text, but we only 212 * create a node if the client wants to see comments at all. 213 */ 214 if (!ignoreComments) { 215 node.appendChild(document.createComment(parser.getText())); 216 } 217 } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) { 218 /* 219 * Found some ignorable whitespace. We only add it if the client 220 * wants to see whitespace. Whitespace before and after the 221 * document element is always ignored. 222 */ 223 if (!ignoreElementContentWhitespace && document != node) { 224 appendText(document, node, token, parser.getText()); 225 } 226 } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) { 227 /* 228 * Found a piece of text (possibly encoded as a CDATA section). 229 * That's the easiest case. We simply take it and create a new text node, 230 * or merge with an adjacent text node. 231 */ 232 appendText(document, node, token, parser.getText()); 233 } else if (token == XmlPullParser.ENTITY_REF) { 234 /* 235 * Found an entity reference. If an entity resolver is 236 * installed, we replace it by text (if possible). Otherwise we 237 * add an entity reference node. 238 */ 239 String entity = parser.getName(); 240 241 if (entityResolver != null) { 242 // TODO Implement this... 243 } 244 245 String resolved = resolvePredefinedOrCharacterEntity(entity); 246 if (resolved != null) { 247 appendText(document, node, token, resolved); 248 } else { 249 node.appendChild(document.createEntityReference(entity)); 250 } 251 } else if (token == XmlPullParser.START_TAG) { 252 /* 253 * Found an element start tag. We create an element node with 254 * the proper info and attributes. We then invoke parse() 255 * recursively to handle the next level of nesting. When we 256 * return from this call, we check that we are on the proper 257 * element end tag. The whole handling differs somewhat 258 * depending on whether the parser is namespace-aware or not. 259 */ 260 if (namespaceAware) { 261 // Collect info for element node 262 String namespace = parser.getNamespace(); 263 String name = parser.getName(); 264 String prefix = parser.getPrefix(); 265 266 if ("".equals(namespace)) { 267 namespace = null; 268 } 269 270 // Create element node and wire it correctly 271 Element element = document.createElementNS(namespace, name); 272 element.setPrefix(prefix); 273 node.appendChild(element); 274 275 for (int i = 0; i < parser.getAttributeCount(); i++) { 276 // Collect info for a single attribute node 277 String attrNamespace = parser.getAttributeNamespace(i); 278 String attrPrefix = parser.getAttributePrefix(i); 279 String attrName = parser.getAttributeName(i); 280 String attrValue = parser.getAttributeValue(i); 281 282 if ("".equals(attrNamespace)) { 283 attrNamespace = null; 284 } 285 286 // Create attribute node and wire it correctly 287 Attr attr = document.createAttributeNS(attrNamespace, attrName); 288 attr.setPrefix(attrPrefix); 289 attr.setValue(attrValue); 290 element.setAttributeNodeNS(attr); 291 } 292 293 // Recursive descent 294 token = parser.nextToken(); 295 parse(parser, document, element, XmlPullParser.END_TAG); 296 297 // Expect the element's end tag here 298 parser.require(XmlPullParser.END_TAG, namespace, name); 299 300 } else { 301 // Collect info for element node 302 String name = parser.getName(); 303 304 // Create element node and wire it correctly 305 Element element = document.createElement(name); 306 node.appendChild(element); 307 308 for (int i = 0; i < parser.getAttributeCount(); i++) { 309 // Collect info for a single attribute node 310 String attrName = parser.getAttributeName(i); 311 String attrValue = parser.getAttributeValue(i); 312 313 // Create attribute node and wire it correctly 314 Attr attr = document.createAttribute(attrName); 315 attr.setValue(attrValue); 316 element.setAttributeNode(attr); 317 } 318 319 // Recursive descent 320 token = parser.nextToken(); 321 parse(parser, document, element, XmlPullParser.END_TAG); 322 323 // Expect the element's end tag here 324 parser.require(XmlPullParser.END_TAG, "", name); 325 } 326 } 327 328 token = parser.nextToken(); 329 } 330 } 331 332 /** 333 * @param token the XML pull parser token type, such as XmlPullParser.CDSECT 334 * or XmlPullParser.ENTITY_REF. 335 */ appendText(DocumentImpl document, Node parent, int token, String text)336 private void appendText(DocumentImpl document, Node parent, int token, String text) { 337 // Ignore empty runs. 338 if (text.isEmpty()) { 339 return; 340 } 341 // Merge with any previous text node if possible. 342 if (coalescing || token != XmlPullParser.CDSECT) { 343 Node lastChild = parent.getLastChild(); 344 if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) { 345 Text textNode = (Text) lastChild; 346 textNode.appendData(text); 347 return; 348 } 349 } 350 // Okay, we really do need a new text node 351 parent.appendChild(token == XmlPullParser.CDSECT 352 ? new CDATASectionImpl(document, text) 353 : new TextImpl(document, text)); 354 } 355 356 @Override setEntityResolver(EntityResolver resolver)357 public void setEntityResolver(EntityResolver resolver) { 358 entityResolver = resolver; 359 } 360 361 @Override setErrorHandler(ErrorHandler handler)362 public void setErrorHandler(ErrorHandler handler) { 363 errorHandler = handler; 364 } 365 366 /** 367 * Controls whether this DocumentBuilder ignores comments. 368 */ setIgnoreComments(boolean value)369 public void setIgnoreComments(boolean value) { 370 ignoreComments = value; 371 } 372 setCoalescing(boolean value)373 public void setCoalescing(boolean value) { 374 coalescing = value; 375 } 376 377 /** 378 * Controls whether this DocumentBuilder ignores element content whitespace. 379 */ setIgnoreElementContentWhitespace(boolean value)380 public void setIgnoreElementContentWhitespace(boolean value) { 381 ignoreElementContentWhitespace = value; 382 } 383 384 /** 385 * Controls whether this DocumentBuilder is namespace-aware. 386 */ setNamespaceAware(boolean value)387 public void setNamespaceAware(boolean value) { 388 namespaceAware = value; 389 } 390 391 /** 392 * Returns the replacement text or null if {@code entity} isn't predefined. 393 */ resolvePredefinedOrCharacterEntity(String entityName)394 private String resolvePredefinedOrCharacterEntity(String entityName) { 395 // Character references, section 4.1 of the XML specification. 396 if (entityName.startsWith("#x")) { 397 return resolveCharacterReference(entityName.substring(2), 16); 398 } else if (entityName.startsWith("#")) { 399 return resolveCharacterReference(entityName.substring(1), 10); 400 } 401 // Predefined entities, section 4.6 of the XML specification. 402 if ("lt".equals(entityName)) { 403 return "<"; 404 } else if ("gt".equals(entityName)) { 405 return ">"; 406 } else if ("amp".equals(entityName)) { 407 return "&"; 408 } else if ("apos".equals(entityName)) { 409 return "'"; 410 } else if ("quot".equals(entityName)) { 411 return "\""; 412 } else { 413 return null; 414 } 415 } 416 resolveCharacterReference(String value, int base)417 private String resolveCharacterReference(String value, int base) { 418 try { 419 int codePoint = Integer.parseInt(value, base); 420 if (Character.isBmpCodePoint(codePoint)) { 421 return String.valueOf((char) codePoint); 422 } else { 423 char[] surrogatePair = Character.toChars(codePoint); 424 return new String(surrogatePair); 425 } 426 } catch (NumberFormatException ex) { 427 return null; 428 } 429 } 430 } 431