• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.harmony.xml.parsers;
18 
19 import java.io.IOException;
20 import java.util.StringTokenizer;
21 
22 import javax.xml.parsers.DocumentBuilder;
23 
24 import org.kxml2.io.KXmlParser;
25 import org.w3c.dom.Attr;
26 import org.w3c.dom.DOMImplementation;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.Element;
29 import org.w3c.dom.Node;
30 import org.xml.sax.EntityResolver;
31 import org.xml.sax.ErrorHandler;
32 import org.xml.sax.InputSource;
33 import org.xml.sax.SAXException;
34 import org.xml.sax.SAXParseException;
35 import org.xml.sax.helpers.LocatorImpl;
36 import org.xmlpull.v1.XmlPullParser;
37 import org.xmlpull.v1.XmlPullParserException;
38 
39 import org.apache.harmony.xml.dom.DOMImplementationImpl;
40 
41 /**
42  * Provides a straightforward DocumentBuilder implementation based on
43  * XMLPull/KXML. The class is used internally only, thus only notable members
44  * that are not already in the abstract superclass are documented. Hope that's
45  * ok.
46  */
47 class DocumentBuilderImpl extends DocumentBuilder {
48 
49     private static DOMImplementation dom = DOMImplementationImpl.getInstance();
50 
51     private EntityResolver entityResolver;
52 
53     private ErrorHandler errorHandler;
54 
55     private boolean ignoreComments;
56 
57     private boolean ignoreElementContentWhitespace;
58 
59     private boolean namespaceAware;
60 
DocumentBuilderImpl()61     DocumentBuilderImpl() {
62         // Do nothing.
63     }
64 
65     @Override
getDOMImplementation()66     public DOMImplementation getDOMImplementation() {
67         return dom;
68     }
69 
70     /**
71      * Reflects whether this DocumentBuilder is configured to ignore comments.
72      *
73      * @return True if and only if comments are ignored.
74      */
isIgnoringComments()75     public boolean isIgnoringComments() {
76         return ignoreComments;
77     }
78 
79     /**
80      * Reflects whether this DocumentBuilder is configured to ignore element
81      * content whitespace.
82      *
83      * @return True if and only if whitespace element content is ignored.
84      */
isIgnoringElementContentWhitespace()85     public boolean isIgnoringElementContentWhitespace() {
86         return ignoreElementContentWhitespace;
87     }
88 
89     @Override
isNamespaceAware()90     public boolean isNamespaceAware() {
91         return namespaceAware;
92     }
93 
94     @Override
isValidating()95     public boolean isValidating() {
96         return false;
97     }
98 
99     @Override
newDocument()100     public Document newDocument() {
101         return dom.createDocument(null, null, null);
102     }
103 
104     @Override
parse(InputSource source)105     public Document parse(InputSource source) throws SAXException, IOException {
106         if (source == null) {
107             throw new IllegalArgumentException();
108         }
109 
110         Document document = newDocument();
111 
112         try {
113             XmlPullParser parser = new KXmlParser();
114 
115             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES,
116                     namespaceAware);
117 
118             if (source.getByteStream() != null) {
119                 parser.setInput(source.getByteStream(), source.getEncoding());
120             } else if (source.getCharacterStream() != null) {
121                 parser.setInput(source.getCharacterStream());
122             } else {
123                 // TODO Accept other sources as well?
124                 throw new SAXParseException(
125                         "InputSource needs either stream or reader", null);
126             }
127 
128             if(parser.nextToken() == XmlPullParser.END_DOCUMENT) {
129                 throw new SAXParseException(
130                         "Unexpected end of document", null);
131             }
132 
133             parse(parser, document, document, XmlPullParser.END_DOCUMENT);
134 
135             parser.require(XmlPullParser.END_DOCUMENT, null, null);
136         } catch (XmlPullParserException ex) {
137             if(ex.getDetail() instanceof IOException) {
138                 throw (IOException)ex.getDetail();
139             }
140             if(ex.getDetail() instanceof RuntimeException) {
141                 throw (RuntimeException)ex.getDetail();
142             }
143 
144             LocatorImpl locator = new LocatorImpl();
145 
146             locator.setPublicId(source.getPublicId());
147             locator.setSystemId(source.getSystemId());
148             locator.setLineNumber(ex.getLineNumber());
149             locator.setColumnNumber(ex.getColumnNumber());
150 
151             SAXParseException newEx = new SAXParseException(ex.getMessage(),
152                     locator);
153 
154             if (errorHandler != null) {
155                 errorHandler.error(newEx);
156             }
157 
158             throw newEx;
159         }
160 
161         return document;
162     }
163 
164     /**
165      * Implements the whole parsing of the XML document. The XML pull parser is
166      * actually more of a tokenizer, and we are doing a classical recursive
167      * descent parsing (the method invokes itself for XML elements). Our
168      * approach to parsing does accept some illegal documents (more than one
169      * root element, for example). The assumption is that the DOM implementation
170      * throws the proper exceptions in these cases.
171      *
172      * @param parser The XML pull parser we're reading from.
173      * @param document The document we're building.
174      * @param node The node we're currently on (initially the document itself).
175      * @param endToken The token that will end this recursive call. Either
176      *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
177      *
178      * @throws XmlPullParserException If a parsing error occurs.
179      * @throws IOException If a general IO error occurs.
180      */
parse(XmlPullParser parser, Document document, Node node, int endToken)181     private void parse(XmlPullParser parser, Document document, Node node,
182             int endToken) throws XmlPullParserException, IOException {
183 
184         int token = parser.getEventType();
185 
186         /*
187          * The main parsing loop. The precondition is that we are already on the
188          * token to be processed. This holds for each iteration of the loop, so
189          * the inner statements have to ensure that (in particular the recursive
190          * call).
191          */
192         while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
193             if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
194                 /*
195                  * Found a processing instructions. We need to split the token
196                  * text at the first whitespace character.
197                  */
198                 String text = parser.getText();
199 
200                 int dot = text.indexOf(' ');
201 
202                 String target = (dot != -1 ? text.substring(0, dot) : text);
203                 String data = (dot != -1 ? text.substring(dot + 1) : "");
204 
205                 node.appendChild(document.createProcessingInstruction(target,
206                         data));
207             } else if (token == XmlPullParser.DOCDECL) {
208                 /*
209                  * Found a document type declaration. Unfortunately KXML doesn't
210                  * have the necessary details. Do we parse it ourselves, or do
211                  * we silently ignore it, since it isn't mandatory in DOM 2
212                  * anyway?
213                  */
214                 StringTokenizer tokenizer = new StringTokenizer(parser.getText());
215                 if (tokenizer.hasMoreTokens()) {
216                     String name = tokenizer.nextToken();
217                     String pubid = null;
218                     String sysid = null;
219 
220                     if (tokenizer.hasMoreTokens()) {
221                         String text = tokenizer.nextToken();
222 
223                         if ("SYSTEM".equals(text)) {
224                             if (tokenizer.hasMoreTokens()) {
225                                 sysid = tokenizer.nextToken();
226                             }
227                         } else if ("PUBLIC".equals(text)) {
228                             if (tokenizer.hasMoreTokens()) {
229                                 pubid = tokenizer.nextToken();
230                             }
231                             if (tokenizer.hasMoreTokens()) {
232                                 sysid = tokenizer.nextToken();
233                             }
234                         }
235                     }
236 
237                     if (pubid != null && pubid.length() >= 2 && pubid.startsWith("\"") && pubid.endsWith("\"")) {
238                         pubid = pubid.substring(1, pubid.length() - 1);
239                     }
240 
241                     if (sysid != null && sysid.length() >= 2 && sysid.startsWith("\"") && sysid.endsWith("\"")) {
242                         sysid = sysid.substring(1, sysid.length() - 1);
243                     }
244 
245                     document.appendChild(dom.createDocumentType(name, pubid, sysid));
246                 }
247 
248             } else if (token == XmlPullParser.COMMENT) {
249                 /*
250                  * Found a comment. We simply take the token text, but we only
251                  * create a node if the client wants to see comments at all.
252                  */
253                 if (!ignoreComments) {
254                     node.appendChild(document.createComment(parser.getText()));
255                 }
256             } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
257                 /*
258                  * Found some ignorable whitespace. We simply take the token
259                  * text, but we only create a node if the client wants to see
260                  * whitespace at all.
261                  */
262                 if (!ignoreElementContentWhitespace) {
263                     node.appendChild(document.createTextNode(parser.getText()));
264                 }
265             } else if (token == XmlPullParser.TEXT) {
266                 /*
267                  * Found a piece of text. That's the easiest case. We simply
268                  * take it and create a corresponding node.
269                  */
270                 node.appendChild(document.createTextNode(parser.getText()));
271             } else if (token == XmlPullParser.CDSECT) {
272                 /*
273                  * Found a CDATA section. That's also trivial. We simply
274                  * take it and create a corresponding node.
275                  */
276                 node.appendChild(document.createCDATASection(parser.getText()));
277             } else if (token == XmlPullParser.ENTITY_REF) {
278                 /*
279                  * Found an entity reference. If an entity resolver is
280                  * installed, we replace it by text (if possible). Otherwise we
281                  * add an entity reference node.
282                  */
283                 String entity = parser.getName();
284 
285                 if (entityResolver != null) {
286                     // TODO Implement this...
287                 }
288 
289                 String replacement = resolveStandardEntity(entity);
290                 if (replacement != null) {
291                     node.appendChild(document.createTextNode(replacement));
292                 } else {
293                     node.appendChild(document.createEntityReference(entity));
294                 }
295             } else if (token == XmlPullParser.START_TAG) {
296                 /*
297                  * Found an element start tag. We create an element node with
298                  * the proper info and attributes. We then invoke parse()
299                  * recursively to handle the next level of nesting. When we
300                  * return from this call, we check that we are on the proper
301                  * element end tag. The whole handling differs somewhat
302                  * depending on whether the parser is namespace-aware or not.
303                  */
304                 if (namespaceAware) {
305                     // Collect info for element node
306                     String namespace = parser.getNamespace();
307                     String name = parser.getName();
308                     String prefix = parser.getPrefix();
309 
310                     if ("".equals(namespace)) {
311                         namespace = null;
312                     }
313 
314                     // Create element node and wire it correctly
315                     Element element = document.createElementNS(namespace, name);
316                     element.setPrefix(prefix);
317                     node.appendChild(element);
318 
319                     for (int i = 0; i < parser.getAttributeCount(); i++) {
320                         // Collect info for a single attribute node
321                         String attrNamespace = parser.getAttributeNamespace(i);
322                         String attrPrefix = parser.getAttributePrefix(i);
323                         String attrName = parser.getAttributeName(i);
324                         String attrValue = parser.getAttributeValue(i);
325 
326                         if ("".equals(attrNamespace)) {
327                             attrNamespace = null;
328                         }
329 
330                         // Create attribute node and wire it correctly
331                         Attr attr = document.createAttributeNS(attrNamespace, attrName);
332                         attr.setPrefix(attrPrefix);
333                         attr.setValue(attrValue);
334                         element.setAttributeNodeNS(attr);
335                     }
336 
337                     // Recursive descent
338                     token = parser.nextToken();
339                     parse(parser, document, element, XmlPullParser.END_TAG);
340 
341                     // Expect the element's end tag here
342                     parser.require(XmlPullParser.END_TAG, namespace, name);
343 
344                 } else {
345                     // Collect info for element node
346                     String name = parser.getName();
347 
348                     // Create element node and wire it correctly
349                     Element element = document.createElement(name);
350                     node.appendChild(element);
351 
352                     for (int i = 0; i < parser.getAttributeCount(); i++) {
353                         // Collect info for a single attribute node
354                         String attrName = parser.getAttributeName(i);
355                         String attrValue = parser.getAttributeValue(i);
356 
357                         // Create attribute node and wire it correctly
358                         Attr attr = document.createAttribute(attrName);
359                         attr.setValue(attrValue);
360                         element.setAttributeNode(attr);
361                     }
362 
363                     // Recursive descent
364                     token = parser.nextToken();
365                     parse(parser, document, element, XmlPullParser.END_TAG);
366 
367                     // Expect the element's end tag here
368                     parser.require(XmlPullParser.END_TAG, "", name);
369                 }
370             }
371 
372             token = parser.nextToken();
373         }
374     }
375 
376     @Override
setEntityResolver(EntityResolver resolver)377     public void setEntityResolver(EntityResolver resolver) {
378         entityResolver = resolver;
379     }
380 
381     @Override
setErrorHandler(ErrorHandler handler)382     public void setErrorHandler(ErrorHandler handler) {
383         errorHandler = handler;
384     }
385 
386     /**
387      * Controls whether this DocumentBuilder ignores comments.
388      *
389      * @param value Turns comment ignorance on or off.
390      */
setIgnoreComments(boolean value)391     public void setIgnoreComments(boolean value) {
392         ignoreComments = value;
393     }
394 
395     /**
396      * Controls whether this DocumentBuilder ignores element content whitespace.
397      *
398      * @param value Turns element whitespace content ignorance on or off.
399      */
setIgnoreElementContentWhitespace(boolean value)400     public void setIgnoreElementContentWhitespace(boolean value) {
401         ignoreElementContentWhitespace = value;
402     }
403 
404     /**
405      * Controls whether this DocumentBuilder is namespace-aware.
406      *
407      * @param value Turns namespace awareness on or off.
408      */
setNamespaceAware(boolean value)409     public void setNamespaceAware(boolean value) {
410         namespaceAware = value;
411     }
412 
413     /**
414      * Resolves one of the five standard XML entities.
415      *
416      * @param entity The name of the entity to resolve, not including
417      *               the ampersand or the semicolon.
418      *
419      * @return The proper replacement, or null, if the entity is unknown.
420      */
resolveStandardEntity(String entity)421     private String resolveStandardEntity(String entity) {
422         if ("lt".equals(entity)) {
423             return "<";
424         } else if ("gt".equals(entity)) {
425             return ">";
426         } else if ("amp".equals(entity)) {
427             return "&";
428         } else if ("apos".equals(entity)) {
429             return "'";
430         } else if ("quot".equals(entity)) {
431             return "\"";
432         } else {
433             return null;
434         }
435     }
436 }
437