• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the  "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 /*
19  * $Id$
20  */
21 
22 package org.apache.qetest.xsl;
23 
24 import java.io.BufferedReader;
25 import java.io.FileReader;
26 import java.io.PrintWriter;
27 import java.net.URL;
28 import java.util.Properties;
29 
30 import javax.xml.parsers.DocumentBuilder;
31 import javax.xml.parsers.DocumentBuilderFactory;
32 
33 import org.apache.qetest.QetestUtils;
34 import org.w3c.dom.Attr;
35 import org.w3c.dom.Document;
36 import org.w3c.dom.Element;
37 import org.w3c.dom.NamedNodeMap;
38 import org.w3c.dom.Node;
39 import org.w3c.dom.Text;
40 import org.w3c.tidy.Tidy;
41 import org.xml.sax.ErrorHandler;
42 import org.xml.sax.InputSource;
43 import org.xml.sax.SAXException;
44 import org.xml.sax.SAXParseException;
45 
46 /**
47  * Uses an XML/HTML/Text diff comparator to check or diff two files.
48  * <p>Given two files, an actual test result and a known good or 'gold'
49  * test result, diff the two files to see if they are equal; if not, provide
50  * some very basic info on where they differ.</p>
51  *
 * <p>Attempts to parse each file as an XML document using a JAXP
 * DocumentBuilderFactory; if that fails, attempts to parse it as an
 * HTML document using JTidy (org.w3c.tidy.Tidy); if that also fails,
 * reads the file as plain text with readLine() and builds a faux
 * document holding one &lt;line&gt; element per line of text.</p>
57  *
58  * <p>The comparison routine then recursively compares the two
59  * documents node-by-node; see the code for exactly how each
60  * node type is handled.  Note that some node types are currently
61  * ignored.</p>
62  *
63  * //@todo document whitespace difference handling better -sc
64  * //@todo check how XML decls are handled (or not) -sc
65  * //@todo Allow param to define the type of parse we do (i.e. if a
66  * testwriter knows their output file will be XML, we should only
67  * attempt to parse it as XML, not other types)
68  * @see XHTComparatorXSLTC for an alternate implementation of
69  * diff() which tests some things as QNames (which checks for the
70  * true namespace, instead of just the prefix)
71  * @author Scott_Boag@lotus.com
72  * @author Shane_Curcuru@lotus.com
73  * @version $Id$
74  */
75 public class XHTComparator
76 {
77 
    /**
     * Maximum number of characters of a node value that
     * {@link #printNodeDiff(Node, Node, PrintWriter)} writes to the log.
     * When two nodes have mismatched values, both values are reported;
     * in some cases they may be extremely long, so each one is cut to
     * this length before being printed.  Change it with
     * {@link #setMaxDisplayLen(int)}.
     */
    protected int maxDisplayLen = 511;  // arbitrary length, for convenience
85 
86     /**
87      * Accessor method for maxDisplayLen.
88      * @param i maximum length we log out
89      */
setMaxDisplayLen(int i)90     public void setMaxDisplayLen(int i)
91     {
92         if (i > 0)
93             maxDisplayLen = i;
94     }
95 
    // Constants for reporting out the reason for failed diffs.
    // Most of them already end with SEPARATOR so they can be used
    // directly as the leading field of a report line.

    /** SEPARATOR ';' - delimits the fields of a report line.  */
    public static final String SEPARATOR = ";";

    /** LBRACKET '['  */
    public static final String LBRACKET = "[";

    /** RBRACKET ']'  */
    public static final String RBRACKET = "]";

    /** TEST 'test', for the actual value.  */
    public static final String TEST = "test";

    /** GOLD 'gold' for the gold or expected value.  */
    public static final String GOLD = "gold";

    /** PARSE_TYPE '-parse-type;' - appended to TEST or GOLD when reporting how a file was parsed. */
    public static final String PARSE_TYPE = "-parse-type" + SEPARATOR;  // postpended to TEST or GOLD

    /** OTHER_ERROR 'other-error;'  */
    public static final String OTHER_ERROR = "other-error" + SEPARATOR;

    /** WARNING 'warning;'  */
    public static final String WARNING = "warning" + SEPARATOR;

    /** MISMATCH_NODE 'mismatch-node;' - node names or node types differ.  */
    public static final String MISMATCH_NODE = "mismatch-node" + SEPARATOR;

    /** MISSING_TEST_NODE 'missing-node-test;' - gold has a node (or node name) that test lacks.  */
    public static final String MISSING_TEST_NODE = "missing-node-" + TEST
                                                       + SEPARATOR;

    /** MISSING_GOLD_NODE 'missing-node-gold;' - test has a node (or node name) that gold lacks.  */
    public static final String MISSING_GOLD_NODE = "missing-node-" + GOLD
                                                       + SEPARATOR;

    /** MISMATCH_ATTRIBUTE 'mismatch-attribute;' - attribute sets of two elements differ.  */
    public static final String MISMATCH_ATTRIBUTE = "mismatch-attribute"
                                                        + SEPARATOR;

    /** MISMATCH_VALUE 'mismatch-value;' - both nodes have values but they are not equal.  */
    public static final String MISMATCH_VALUE = "mismatch-value" + SEPARATOR;

    /** MISMATCH_VALUE_GOLD 'mismatch-value-gold;' - precedes the gold value printed by printNodeDiff.  */
    public static final String MISMATCH_VALUE_GOLD = "mismatch-value-gold" + SEPARATOR;

    /** MISMATCH_VALUE_TEXT 'mismatch-value-text;' - precedes the test value printed by printNodeDiff.  */
    public static final String MISMATCH_VALUE_TEXT = "mismatch-value-text" + SEPARATOR;

    /** MISSING_TEST_VALUE 'missing-value-test;' - gold node has a value, test node has none.  */
    public static final String MISSING_TEST_VALUE = "missing-value-" + TEST
                                                        + SEPARATOR;

    /** MISSING_GOLD_VALUE 'missing-value-gold;' - test node has a value, gold node has none.  */
    public static final String MISSING_GOLD_VALUE = "missing-value-" + GOLD
                                                        + SEPARATOR;

    /** WHITESPACE_DIFF 'whitespace-diff;' - printed whenever a whitespace-only text node is skipped.  */
    public static final String WHITESPACE_DIFF = "whitespace-diff;";
154 
155     /**
156      * Compare two files by parsing into DOMs and comparing trees.
157      *
158      * <p>Parses the goldFileName by using the
159      * {@link #parse(String, PrintWriter, String, Properties) parse worker method}
160      * - if null, we bail and return false.  If non-null, we parse the
161      * testFileName into a Document as well.  Then we call
162      * {@link #diff(Node, Node, PrintWriter, boolean[]) diff worker method}
163      * to do the real work of comparing.</p>
164      *
165      * @param goldFileName expected file
166      * @param testFileName actual file
167      * @param reporter PrintWriter to dump status info to
168      * @param warning array of warning flags (for whitespace diffs,
169      * item[0] is set to true if we find whitespace-only diffs)
170      * @param attributes to attempt to set onto parsers
171      * @return true if they match, false otherwise
172      */
compare(String goldFileName, String testFileName, PrintWriter reporter, boolean[] warning, Properties attributes)173     public boolean compare(String goldFileName, String testFileName,
174                            PrintWriter reporter, boolean[] warning,
175                            Properties attributes)
176     {
177 
178         // parse the gold doc
179         Document goldDoc = parse(goldFileName, reporter, GOLD, attributes);
180 
181         // parse the test doc only if gold doc was parsed OK
182         //@todo Jun-02 -sc Note the logic here might be improveable to
183         //  actually report file missing problems better: i.e.
184         //  in theory, if the actual is missing, it's a fail; if
185         //  the gold (only) is missing, it's ambiguous
186         Document testDoc = (null != goldDoc)
187                            ? parse(testFileName, reporter, TEST, attributes) : null;
188 
189         if (null == goldDoc)
190         {
191             reporter.println(OTHER_ERROR + GOLD + SEPARATOR
192                              + "document null");
193 
194             return false;
195         }
196         else if (null == testDoc)
197         {
198             reporter.println(OTHER_ERROR + TEST + SEPARATOR
199                              + "document null");
200 
201             return false;
202         }
203 
204         return diff(goldDoc, testDoc, reporter, warning);
205     }
206 
207     // Reporter format:
208     // REASON_CONSTANT;gold val;test val;reason description
209 
210     /**
211      * Diff two Nodes recursively and report true if equal.
212      *
213      * <p>The contract is: when you enter here the gold and test nodes are the same type,
214      * both non-null, and both in the same basic position in the tree.
215      * //@todo verify caller really performs for the contract -sc</p>
216      *
217      * <p>See the code for how it's done; note that not all node
218      * types are actually compared currently.  Also see
219      * {@link XHTComparatorXSLTC} for an alternate implementation.</p>
220      *
221      * @param gold or expected node
222      * @param test actual node
223      * @param reporter PrintWriter to dump status info to
224      * @param warning[] if any whitespace diffs found
225      *
226      * @return true if pass, false if any problems encountered
227      */
diff(Node gold, Node test, PrintWriter reporter, boolean[] warning)228     boolean diff(Node gold, Node test, PrintWriter reporter,
229                  boolean[] warning)
230     {
231 
232         String name1 = gold.getNodeName();
233         String name2 = test.getNodeName();
234 
235         // If both there but not equal, fail
236         if ((null != name1) && (null != name2) &&!name1.equals(name2))
237         {
238             reporter.println(MISMATCH_NODE + nodeTypeString(gold) + SEPARATOR
239                            + nodeTypeString(test) + SEPARATOR
240                            + "name does not equal test node");
241 
242             return false;
243         }
244         else if ((null != name1) && (null == name2))
245         {
246             reporter.println(MISSING_TEST_NODE + nodeTypeString(gold)
247                            + SEPARATOR + nodeTypeString(test) + SEPARATOR
248                            + "name missing on test");
249 
250             return false;
251         }
252         else if ((null == name1) && (null != name2))
253         {
254             reporter.println(MISSING_GOLD_NODE + nodeTypeString(gold)
255                            + SEPARATOR + nodeTypeString(test) + SEPARATOR
256                            + "name missing on gold");
257 
258             return false;
259         }
260 
261         String value1 = gold.getNodeValue();
262         String value2 = test.getNodeValue();
263 
264         if ((null != value1) && (null != value2) &&!value1.equals(value2))
265         {
266             reporter.println(MISMATCH_VALUE + nodeTypeString(gold) + "len="
267                            + value1.length() + SEPARATOR
268                            + nodeTypeString(test) + "len=" + value2.length()
269                            + SEPARATOR + "values do not match");
270             printNodeDiff(gold, test, reporter);
271             return false;
272         }
273         else if ((null != value1) && (null == value2))
274         {
275             reporter.println(MISSING_TEST_VALUE + nodeTypeString(gold) + "-"
276                            + value1 + SEPARATOR + nodeTypeString(test)
277                            + SEPARATOR + "test no value");
278 
279             return false;
280         }
281         else if ((null == value1) && (null != value2))
282         {
283             reporter.println(MISSING_GOLD_VALUE + nodeTypeString(gold)
284                            + SEPARATOR + nodeTypeString(test) + "-" + value2
285                            + SEPARATOR + "gold no value");
286 
287             return false;
288         }
289 
290         switch (gold.getNodeType())
291         {
292         case Node.DOCUMENT_NODE :
293         {
294 
295             // Why don't we do anything here? -sc
296         }
297         break;
298         case Node.ELEMENT_NODE :
299         {
300 
301             // Explicitly ignore attribute ordering
302             // TODO do we need to make this settable for testing purposes? -sc
303             NamedNodeMap goldAttrs = gold.getAttributes();
304             NamedNodeMap testAttrs = test.getAttributes();
305 
306             if ((null != goldAttrs) && (null == testAttrs))
307             {
308                 reporter.println(MISMATCH_ATTRIBUTE + nodeTypeString(gold)
309                                + SEPARATOR + nodeTypeString(test) + SEPARATOR
310                                + "test no attrs");
311 
312                 return false;
313             }
314             else if ((null == goldAttrs) && (null != testAttrs))
315             {
316                 reporter.println(MISMATCH_ATTRIBUTE + nodeTypeString(gold)
317                                + SEPARATOR + nodeTypeString(test) + SEPARATOR
318                                + "gold no attrs");
319 
320                 return false;
321             }
322 
323             int gn = goldAttrs.getLength();
324             int tn = testAttrs.getLength();
325 
326             if (gn != tn)
327             {
328                 reporter.println(MISMATCH_ATTRIBUTE + nodeTypeString(gold)
329                                + "-" + gn + SEPARATOR + nodeTypeString(test)
330                                + "-" + tn + SEPARATOR
331                                + "attribte count mismatch");
332 
333                 // TODO: add output of each set of attrs for comparisons
334                 return false;
335             }
336 
337             // TODO verify this checks the full list of attributes both ways,
338             //      from gold->test and from test->gold -sc
339             for (int i = 0; i < gn; i++)
340             {
341                 Attr goldAttr = (Attr) goldAttrs.item(i);
342                 String goldAttrName = goldAttr.getName();
343                 Node testAttr = testAttrs.getNamedItem(goldAttrName);
344 
345                 if (null == testAttr)
346                 {
347                     reporter.println(MISMATCH_ATTRIBUTE + nodeTypeString(gold)
348                                    + "-" + goldAttrName + SEPARATOR
349                                    + nodeTypeString(test) + SEPARATOR
350                                    + "missing attribute on test");
351 
352                     return false;
353                 }
354 
355                 if (!diff(goldAttr, testAttr, reporter, warning))
356                 {
357                     return false;
358                 }
359             }
360         }
361         break;
362         case Node.CDATA_SECTION_NODE :{}
363         break;
364         case Node.ENTITY_REFERENCE_NODE :{}
365         break;
366         case Node.ATTRIBUTE_NODE :{}
367         break;
368         case Node.COMMENT_NODE :{}
369         break;
370         case Node.ENTITY_NODE :{}
371         break;
372         case Node.NOTATION_NODE :{}
373         break;
374         case Node.PROCESSING_INSTRUCTION_NODE :{}
375         break;
376         case Node.TEXT_NODE :{}
377         break;
378         default :{}
379         }
380 
381         Node try2[] = new Node[2];
382         Node goldChild = gold.getFirstChild();
383         Node testChild = test.getFirstChild();
384 
385         if (!basicChildCompare(goldChild, testChild, reporter, warning, try2))
386             return false;
387 
388         goldChild = try2[0];
389         testChild = try2[1];
390 
391         while (null != goldChild)
392         {
393             if (!diff(goldChild, testChild, reporter, warning))
394                 return false;
395 
396             goldChild = goldChild.getNextSibling();
397             testChild = testChild.getNextSibling();
398 
399             if (!basicChildCompare(goldChild, testChild, reporter, warning,
400                                    try2))
401                 return false;
402 
403             goldChild = try2[0];
404             testChild = try2[1];
405         }
406 
407         return true;
408     }  // end of diff()
409 
410     /**
411      * Returns Character.isWhitespace
412      * @param s String to check for whitespace
413      * @return true if all whitespace; false otherwise
414      */
isWhiteSpace(String s)415     boolean isWhiteSpace(String s)
416     {
417 
418         int n = s.length();
419 
420         for (int i = 0; i < n; i++)
421         {
422             if (!Character.isWhitespace(s.charAt(i)))
423                 return false;
424         }
425 
426         return true;
427     }  // end of isWhiteSpace()
428 
429     /**
430      * NEEDSDOC Method tryToAdvancePastWhitespace
431      *
432      *
433      * @param n node to check if it's whitespace
434      * @param reporter PrintWriter to dump status info to
435      * @param warning set to true if we advance past a
436      * whitespace node; note that this logic isn't quite
437      * correct, I think (it should only be set if
438      * we advance past whitespace that isn't equal in
439      * both trees or something like that)
440      * @param next array of nodes to continue thru
441      * @param which index into next array
442      *
443      * @return Node we should be at after advancing
444      */
tryToAdvancePastWhitespace(Node n, PrintWriter reporter, boolean[] warning, Node next[], int which)445     Node tryToAdvancePastWhitespace(Node n, PrintWriter reporter,
446                                     boolean[] warning, Node next[], int which)
447     {
448 
449         if (n.getNodeType() == Node.TEXT_NODE)
450         {
451             String data = n.getNodeValue();
452 
453             if (null != data)
454             {
455                 if (isWhiteSpace(data))
456                 {
457                     warning[0] = true;
458 
459                     reporter.print(WHITESPACE_DIFF + " ");  // TODO check the format of this; maybe use println -sc
460 
461                     n = n.getNextSibling();
462                     next[which] = n;
463                 }
464             }
465         }
466 
467         return n;
468     }  // end of tryToAdvancePastWhitespace()
469 
470     /**
471      * NEEDSDOC Method basicChildCompare
472      *
473      *
474      * NEEDSDOC @param gold
475      * NEEDSDOC @param test
476      * @param reporter PrintWriter to dump status info to
477      * NEEDSDOC @param warning
478      * NEEDSDOC @param next
479      *
480      * NEEDSDOC (basicChildCompare) @return
481      */
basicChildCompare(Node gold, Node test, PrintWriter reporter, boolean[] warning, Node next[])482     boolean basicChildCompare(Node gold, Node test, PrintWriter reporter,
483                               boolean[] warning, Node next[])
484     {
485 
486         next[0] = gold;
487         next[1] = test;
488 
489         boolean alreadyTriedToAdvance = false;
490 
491         if ((null != gold) && (null == test))
492         {
493             gold = tryToAdvancePastWhitespace(gold, reporter, warning, next,
494                                               0);
495             alreadyTriedToAdvance = true;
496 
497             if ((null != gold) && (null == test))
498             {
499                 reporter.println(MISSING_TEST_NODE + nodeTypeString(gold)
500                                + SEPARATOR + SEPARATOR
501                                + "missing node on test");
502 
503                 return false;
504             }
505         }
506         else if ((null == gold) && (null != test))
507         {
508             test = tryToAdvancePastWhitespace(test, reporter, warning, next,
509                                               1);
510             alreadyTriedToAdvance = true;
511 
512             if ((null == gold) && (null != test))
513             {
514                 reporter.println(MISSING_GOLD_NODE + SEPARATOR
515                                + nodeTypeString(test) + SEPARATOR
516                                + "missing node on gold");
517 
518                 return false;
519             }
520         }
521 
522         if ((null != gold) && (gold.getNodeType() != test.getNodeType()))
523         {
524             Node savedGold = gold;
525             Node savedTest = test;
526 
527             if (!alreadyTriedToAdvance)
528             {
529                 gold = tryToAdvancePastWhitespace(gold, reporter, warning,
530                                                   next, 0);
531 
532                 if (gold == savedGold)
533                 {
534                     test = tryToAdvancePastWhitespace(test, reporter,
535                                                       warning, next, 1);
536                 }
537             }
538 
539             if ((null != gold) && (gold.getNodeType() != test.getNodeType()))
540             {
541                 gold = savedGold;
542                 test = savedTest;
543 
544                 reporter.println(MISMATCH_NODE + nodeTypeString(gold)
545                                + SEPARATOR + nodeTypeString(test) + SEPARATOR
546                                + "node type mismatch");
547                 printNodeDiff(gold, test, reporter);
548 
549                 return false;
550             }
551         }
552 
553         return true;
554     }  // end of basicChildCompare()
555 
556     /**
557      * Cheap-o text printout of a node.  By Scott.
558      *
559      * @param n node to print info for
560      * @return String of getNodeType plus getNodeName
561      */
nodeTypeString(Node n)562     public static String nodeTypeString(Node n)
563     {
564         switch (n.getNodeType())
565         {
566         case Node.DOCUMENT_NODE :
567             return "DOCUMENT(" + n.getNodeName() + ")";
568         case Node.ELEMENT_NODE :
569             return "ELEMENT(" + n.getNodeName() + ")";
570         case Node.CDATA_SECTION_NODE :
571             return "CDATA_SECTION(" + n.getNodeName() + ")";
572         case Node.ENTITY_REFERENCE_NODE :
573             return "ENTITY_REFERENCE(" + n.getNodeName() + ")";
574         case Node.ATTRIBUTE_NODE :
575             return "ATTRIBUTE(" + n.getNodeName() + ")";
576         case Node.COMMENT_NODE :
577             return "COMMENT(" + n.getNodeName() + ")";
578         case Node.ENTITY_NODE :
579             return "ENTITY(" + n.getNodeName() + ")";
580         case Node.NOTATION_NODE :
581             return "NOTATION(" + n.getNodeName() + ")";
582         case Node.PROCESSING_INSTRUCTION_NODE :
583             return "PROCESSING_INSTRUCTION(" + n.getNodeName() + ")";
584         case Node.TEXT_NODE :
585             return "TEXT()"; // #text is all that's ever printed out, so skip it
586         default :
587             return "UNKNOWN(" + n.getNodeName() + ")";
588         }
589     }  // end of nodeTypeString()
590 
591 
592     /**
593      * Cheap-o text printout of two different nodes.
594      *
595      * @param goldNode or expected node to print info
596      * @param testNode or actual node to print info
597      * @param n node to print info for
598      * @param reporter PrintWriter to dump status info to
599      */
printNodeDiff(Node goldNode, Node testNode, PrintWriter reporter)600     public void printNodeDiff(Node goldNode, Node testNode, PrintWriter reporter)
601     {
602         String goldValue = goldNode.getNodeValue();
603         String testValue = testNode.getNodeValue();
604         if (null == goldValue)
605             goldValue = "null";
606         if (null == testValue)
607             testValue = "null";
608 
609         // Limit length we output to logs; extremely long values
610         //  are more hassle than they're worth (at that point,
611         //  it's either obvious what the problem is, or it's
612         //  such a small problem that you'll need to manually
613         //  compare the files separately
614         if (goldValue.length() > maxDisplayLen)
615             goldValue = goldValue.substring(0, maxDisplayLen);
616         if (testValue.length() > maxDisplayLen)
617             testValue = testValue.substring(0, maxDisplayLen);
618         reporter.println(MISMATCH_VALUE_GOLD + nodeTypeString(goldNode) + SEPARATOR + "\n" + goldValue);
619         reporter.println(MISMATCH_VALUE_TEXT + nodeTypeString(testNode) + SEPARATOR + "\n" + testValue);
620     }
621 
622 
623     /**
624      * Simple worker method to parse filename to a Document.
625      *
626      * <p>Attempts XML parse, if that throws an exception, then
627      * we attempt an HTML parse (when parser available), if
628      * that throws an exception, then we parse as text:
629      * we construct a faux document element to hold it all,
630      * and then parse by readLine() and put each line of
631      * text into a &lt;line> element.</p>
632      *
633      * @param filename to parse as a local path
634      * @param reporter PrintWriter to dump status info to
635      * @param which either TEST or GOLD file being parsed
636      * @param attributes name=value pairs to set on the
637      * DocumentBuilderFactory that we use to parse
638      *
639      * @return Document object with contents of the file;
640      * otherwise throws an unchecked RuntimeException if there
641      * is any fatal problem
642      */
parse(String filename, PrintWriter reporter, String which, Properties attributes)643     Document parse(String filename, PrintWriter reporter, String which, Properties attributes)
644     {
645         // Force filerefs to be URI's if needed: note this is independent of any other files
646         String docURI = QetestUtils.filenameToURL(filename);
647 
648         DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance();
649         // Always set namespaces on
650         dfactory.setNamespaceAware(true);
651         // Set other attributes here as needed
652         applyAttributes(dfactory, attributes);
653 
654         // Local class: cheap non-printing ErrorHandler
655         // This is used to suppress validation warnings which
656         //  would otherwise clutter up the console
657         ErrorHandler nullHandler = new ErrorHandler() {
658             public void warning(SAXParseException e) throws SAXException {}
659             public void error(SAXParseException e) throws SAXException {}
660             public void fatalError(SAXParseException e) throws SAXException
661             {
662                 throw e;
663             }
664         };
665 
666         String parseType = which + PARSE_TYPE + "[xml];";
667         Document doc = null;
668         try
669         {
670             // First, attempt to parse as XML (preferred)...
671             DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
672             docBuilder.setErrorHandler(nullHandler);
673             doc = docBuilder.parse(new InputSource(docURI));
674         }
675         catch (Throwable se)
676         {
677             // ... if we couldn't parse as XML, attempt parse as HTML...
678             reporter.println(WARNING + se.toString());
679             parseType = which + PARSE_TYPE + "[html];";
680 
681             try
682             {
683                 // Use the copy of Tidy that the XSLTC team has checked in
684                 // Submitted by: Gunnar Klauberg <gklauberg@yahoo.de>
685                 // Alternate by: Santiago.PericasGeertsen@sun.com
686     	        Tidy tidy = new Tidy();
687     	        tidy.setXHTML(true);
688     	        tidy.setTidyMark(false);
689     	        tidy.setShowWarnings(false);
690                 tidy.setShowErrors(0);
691     	        tidy.setQuiet(true);
692     	        doc = tidy.parseDOM(new URL(docURI).openStream(), null);
693             }
694             catch (Exception e)
695             {
696                 // ... if we can't parse as HTML, then just parse the text
697                 try
698                 {
699                     reporter.println(WARNING + e.toString());
700                     parseType = which + PARSE_TYPE + "[text];";
701 
702                     // First build a faux document with parent element
703                     DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
704                     doc = docBuilder.newDocument();
705                     Element outElem = doc.createElement("out");
706 
707                     // Parse as text, line by line
708                     //   Since we already know it should be text, this should
709                     //   work better than parsing by bytes.
710                     FileReader fr = new FileReader(filename);
711                     BufferedReader br = new BufferedReader(fr);
712                     for (;;)
713                     {
714                         String tmp = br.readLine();
715 
716                         if (tmp == null)
717                         {
718                             break;
719                         }
720                         // An additional thing we could do would
721                         //  be to put in the line number in the
722                         //  file in here somehow, so when users
723                         //  view reports, they get that info
724                         Element lineElem = doc.createElement("line");
725                         outElem.appendChild(lineElem);
726                         Text textNode = doc.createTextNode(tmp);
727                         lineElem.appendChild(textNode);
728                     }
729                     // Now stick the whole element into the document to return
730                     doc.appendChild(outElem);
731                 }
732                 catch (Throwable throwable)
733                 {
734                     reporter.println(OTHER_ERROR + filename + SEPARATOR
735                                    + "threw:" + throwable.toString());
736                 }
737             }
738         }
739 
740         // Output a newline here for readability
741         reporter.println(parseType);
742 
743         return doc;
744     }  // end of parse()
745 
746     /**
747      * Pass applicable attributes onto our DocumentBuilderFactory.
748      *
749      * Only passes thru attributes we explicitly know about and
750      * are constants from XHTFileCheckService.
751      *
752      * @param dbf factory to attempt to set* onto
753      * @param attrs various attributes we should try to set
754      */
applyAttributes(DocumentBuilderFactory dfactory, Properties attributes)755     protected void applyAttributes(DocumentBuilderFactory dfactory, Properties attributes)
756     {
757         if ((null == attributes) || (null == dfactory))
758             return;
759 
760         String tmp = attributes.getProperty(XHTFileCheckService.SETVALIDATING);
761         if (null != tmp)
762         {
763             dfactory.setValidating(new Boolean(tmp).booleanValue());
764         }
765         tmp = attributes.getProperty(XHTFileCheckService.SETIGNORINGELEMENTCONTENTWHITESPACE);
766         if (null != tmp)
767         {
768             dfactory.setIgnoringElementContentWhitespace(new Boolean(tmp).booleanValue());
769         }
770         tmp = attributes.getProperty(XHTFileCheckService.SETEXPANDENTITYREFERENCES);
771         if (null != tmp)
772         {
773             dfactory.setExpandEntityReferences(new Boolean(tmp).booleanValue());
774         }
775         tmp = attributes.getProperty(XHTFileCheckService.SETIGNORINGCOMMENTS);
776         if (null != tmp)
777         {
778             dfactory.setIgnoringComments(new Boolean(tmp).booleanValue());
779         }
780         tmp = attributes.getProperty(XHTFileCheckService.SETCOALESCING);
781         if (null != tmp)
782         {
783             dfactory.setCoalescing(new Boolean(tmp).booleanValue());
784         }
785         /* Unknown attributes are ignored! */
786     }
787 
788 }
789