1 /* 2 ****************************************************************************** 3 * Copyright (C) 2003-2014, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 /** 8 * @author Ram Viswanadha 9 * 10 * This tool validates xml against DTD or valid XML ... IE 6 does not do a good job 11 */ 12 package org.unicode.cldr.util; 13 14 import java.io.BufferedReader; 15 import java.io.File; 16 import java.io.FileInputStream; 17 import java.io.FileReader; 18 import java.io.FilenameFilter; 19 import java.io.IOException; 20 21 import javax.xml.parsers.DocumentBuilder; 22 import javax.xml.parsers.DocumentBuilderFactory; 23 24 import org.w3c.dom.Document; 25 import org.w3c.dom.Element; 26 import org.w3c.dom.Text; 27 import org.xml.sax.ErrorHandler; 28 import org.xml.sax.InputSource; 29 import org.xml.sax.SAXException; 30 import org.xml.sax.SAXParseException; 31 32 @CLDRTool(alias = "validate", description = "Check XML files for validity") 33 public class XMLValidator { 34 public static boolean quiet = false; 35 public static boolean parseonly = false; 36 main(String[] args)37 public static void main(String[] args) throws IOException { 38 if (args.length == 0) { 39 System.out.println("No files specified. Validation failed. Use --help for help."); 40 return; 41 } 42 for (int i = 0; i < args.length; i++) { 43 if (args[i].equals("-q") || args[i].equals("--quiet")) { 44 quiet = true; 45 } else if (args[i].equals("--help")) { 46 usage(); 47 return; 48 } else if (args[i].equals("--parseonly")) { 49 System.err.println("# DTD Validation is disabled. Will only check for well formed XML."); 50 parseonly = true; 51 } else { 52 File f = new File(args[i]); 53 if (f.isDirectory()) { 54 parseDirectory(f); 55 } else { 56 if (!quiet) System.out.println("Processing file " + args[i]); 57 new fileParserThread(args[i]).run(); 58 } 59 } 60 } 61 if (parseonly) { 62 System.err.println("# DTD Validation is disabled. Only checked for well formed XML."); 63 } 64 } 65 parseDirectory(File f)66 private static void parseDirectory(File f) throws IOException { 67 // System.err.println("Parsing directory " + f.getAbsolutePath()); 68 for (File s : f.listFiles(new FilenameFilter() { 69 @Override 70 public boolean accept(File arg0, String arg1) { 71 if (arg1.startsWith(".")) { 72 return false; // skip .git, .svn, ... 73 } 74 File n = new File(arg0, arg1); 75 // System.err.println("Considering " + n.getAbsolutePath() ); 76 if (n.isDirectory()) { 77 try { 78 parseDirectory(n); 79 } catch (IOException e) { 80 // TODO Auto-generated catch block 81 e.printStackTrace(); 82 System.err.println("Error " + e.toString() + " parsing " + arg0.getPath()); 83 } 84 return false; 85 } else if (arg1.endsWith(".xml")) { 86 return true; 87 } else { 88 return false; 89 } 90 } 91 })) { 92 if (!quiet) System.out.println("Processing file " + s.getPath()); 93 new fileParserThread(s.getCanonicalPath()).run(); 94 } 95 } 96 usage()97 private static void usage() { 98 System.err.println("usage: " + XMLValidator.class.getName() + " [ -q ] [ --help ] [ --parseonly ] file ..."); 99 System.err.println("usage: " + XMLValidator.class.getName() 100 + " [ -q ] [ --help ] [ --parseonly ] directory ..."); 101 } 102 103 /** 104 * Utility method to translate a String filename to URL. 105 * 106 * Note: This method is not necessarily proven to get the correct URL for 107 * every possible kind of filename; it should be improved. It handles the 108 * most common cases that we've encountered when running Conformance tests 109 * on Xalan. Also note, this method does not handle other non-file: flavors 110 * of URLs at all. 111 * 112 * If the name is null, return null. If the name starts with a common URI 113 * scheme (namely the ones found in the examples of RFC2396), then simply 114 * return the name as-is (the assumption is that it's already a URL) 115 * Otherwise we attempt (cheaply) to convert to a file:/// URL. 116 * 117 * @param filename 118 * a local path/filename of a file 119 * @return a file:/// URL, the same string if it appears to already be a 120 * URL, or null if error 121 */ filenameToURL(String filename)122 public static String filenameToURL(String filename) { 123 // null begets null - something like the commutative property 124 if (null == filename) 125 return null; 126 127 // Don't translate a string that already looks like a URL 128 if (filename.startsWith("file:") || filename.startsWith("http:") 129 || filename.startsWith("ftp:") 130 || filename.startsWith("gopher:") 131 || filename.startsWith("mailto:") 132 || filename.startsWith("news:") 133 || filename.startsWith("telnet:")) 134 return filename; 135 136 File f = new File(filename); 137 String tmp = null; 138 try { 139 // This normally gives a better path 140 tmp = f.getCanonicalPath(); 141 } catch (IOException ioe) { 142 // But this can be used as a backup, for cases 143 // where the file does not exist, etc. 144 tmp = f.getAbsolutePath(); 145 } 146 147 // URLs must explicitly use only forward slashes 148 if (File.separatorChar == '\\') { 149 tmp = tmp.replace('\\', '/'); 150 } 151 // Note the presumption that it's a file reference 152 // Ensure we have the correct number of slashes at the 153 // start: we always want 3 /// if it's absolute 154 // (which we should have forced above) 155 if (tmp.startsWith("/")) 156 return "file://" + tmp; 157 else 158 return "file:///" + tmp; 159 160 } 161 162 public static class fileParserThread extends Thread { 163 String filename; 164 fileParserThread(String _filename)165 fileParserThread(String _filename) { 166 filename = _filename; 167 } 168 run()169 public void run() { 170 // Force filerefs to be URI's if needed: note this is independent of any 171 // other files 172 String docURI = filenameToURL(filename); 173 parse(new InputSource(docURI), filename); 174 } 175 } 176 parse(InputSource docSrc, String filename)177 static Document parse(InputSource docSrc, String filename) { 178 179 // Check for BOM. 180 try { 181 FileInputStream fis = null; 182 try { 183 fis = new FileInputStream(filename); 184 byte bytes[] = new byte[3]; 185 if (fis.read(bytes) == 3 && 186 bytes[0] == (byte) 0xef && 187 bytes[1] == (byte) 0xbb && 188 bytes[2] == (byte) 0xbf) { 189 System.err.println(filename + ": ERROR: contains UTF-8 BOM (shouldn't happen in CLDR XML files)"); 190 } 191 } finally { 192 if (fis != null) { 193 fis.close(); 194 } 195 } 196 } catch (IOException ioe) { /* ignored- other branches will report an error. */ 197 } 198 199 DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance(); 200 // Always set namespaces on 201 if (!parseonly) { 202 dfactory.setNamespaceAware(true); 203 dfactory.setValidating(true); 204 } 205 // Set other attributes here as needed 206 // applyAttributes(dfactory, attributes); 207 208 // Local class: cheap non-printing ErrorHandler 209 // This is used to suppress validation warnings 210 final String filename2 = filename; 211 ErrorHandler nullHandler = new ErrorHandler() { 212 public void warning(SAXParseException e) throws SAXException { 213 System.err.println(filename2 + ": Warning: " + e.getMessage()); 214 215 } 216 217 public void error(SAXParseException e) throws SAXException { 218 int col = e.getColumnNumber(); 219 System.err.println(filename2 + ":" + e.getLineNumber() + (col >= 0 ? ":" + col : "") 220 + ": ERROR: Element " + e.getPublicId() 221 + " is not valid because " + e.getMessage()); 222 } 223 224 public void fatalError(SAXParseException e) throws SAXException { 225 System.err.println(filename2 + ": ERROR "); 226 throw e; 227 } 228 }; 229 230 Document doc = null; 231 try { 232 // First, attempt to parse as XML (preferred)... 233 DocumentBuilder docBuilder = dfactory.newDocumentBuilder(); 234 docBuilder.setErrorHandler(nullHandler); 235 docBuilder.setEntityResolver(new CachingEntityResolver()); 236 // if(docBuilder.isValidating()){ 237 // System.out.println("The parser is a validating parser"); 238 // } 239 doc = docBuilder.parse(docSrc); 240 } catch (Throwable se) { 241 // ... if we couldn't parse as XML, attempt parse as HTML... 242 if (se instanceof SAXParseException) { 243 SAXParseException pe = (SAXParseException) se; 244 int col = pe.getColumnNumber(); 245 System.err.println(filename + ":" + pe.getLineNumber() + (col >= 0 ? ":" + col : "") + ": ERROR:" 246 + se.toString()); 247 } else { 248 System.err.println(filename + ": ERROR:" + se.toString()); 249 } 250 try { 251 // @todo need to find an HTML to DOM parser we can use!!! 252 // doc = someHTMLParser.parse(new InputSource(filename)); 253 throw new RuntimeException(filename + ": XMLComparator not HTML parser!"); 254 } catch (Exception e) { 255 if (filename != null) { 256 // ... if we can't parse as HTML, then just parse the text 257 try { 258 259 // Parse as text, line by line 260 // Since we already know it should be text, this should 261 // work better than parsing by bytes. 262 FileReader fr = new FileReader(filename); 263 BufferedReader br = new BufferedReader(fr); 264 StringBuffer buffer = new StringBuffer(); 265 for (;;) { 266 String tmp = br.readLine(); 267 268 if (tmp == null) { 269 break; 270 } 271 272 buffer.append(tmp); 273 buffer.append("\n"); // Put in the newlines as well 274 } 275 br.close(); 276 DocumentBuilder docBuilder = dfactory 277 .newDocumentBuilder(); 278 doc = docBuilder.newDocument(); 279 Element outElem = doc.createElement("out"); 280 Text textNode = doc.createTextNode(buffer.toString()); 281 282 // Note: will this always be a valid node? If we're 283 // parsing 284 // in as text, will there ever be cases where the diff that's 285 // done later on will fail becuase some really garbage-like 286 // text has been put into a node? 287 outElem.appendChild(textNode); 288 doc.appendChild(outElem); 289 } catch (Throwable throwable) { 290 291 // throwable.printStackTrace(); 292 } 293 } 294 } 295 } 296 return doc; 297 } 298 } 299