1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. 2 // 3 // TagSoup is licensed under the Apache License, 4 // Version 2.0. You may obtain a copy of this license at 5 // http://www.apache.org/licenses/LICENSE-2.0 . You may also have 6 // additional legal rights not granted by this license. 7 // 8 // TagSoup is distributed in the hope that it will be useful, but 9 // unless required by applicable law or agreed to in writing, TagSoup 10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 11 // OF ANY KIND, either express or implied; not even the implied warranty 12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // 14 // 15 // The TagSoup command line UI 16 17 package org.ccil.cowan.tagsoup; 18 import java.util.Hashtable; 19 import java.util.Enumeration; 20 import java.io.*; 21 import java.net.URL; 22 import java.net.URLConnection; 23 import org.xml.sax.*; 24 import org.xml.sax.helpers.DefaultHandler; 25 import org.xml.sax.ext.LexicalHandler; 26 27 28 /** 29 The stand-alone TagSoup program. 30 **/ 31 public class CommandLine { 32 33 static Hashtable options = new Hashtable(); static { 34 options.put("--nocdata", Boolean.FALSE); // CDATA elements are normal 35 options.put("--files", Boolean.FALSE); // process arguments as separate files 36 options.put("--reuse", Boolean.FALSE); // reuse a single Parser 37 options.put("--nons", Boolean.FALSE); // no namespaces 38 options.put("--nobogons", Boolean.FALSE); // suppress unknown elements 39 options.put("--any", Boolean.FALSE); // unknowns have ANY content model 40 options.put("--emptybogons", Boolean.FALSE); // unknowns have EMPTY content model 41 options.put("--norootbogons", Boolean.FALSE); // unknowns can't be the root 42 options.put("--pyxin", Boolean.FALSE); // input is PYX 43 options.put("--lexical", Boolean.FALSE); // output comments 44 options.put("--pyx", Boolean.FALSE); // output is PYX 45 options.put("--html", Boolean.FALSE); // output is HTML 46 options.put("--method=", Boolean.FALSE); // output method 47 options.put("--doctype-public=", Boolean.FALSE); // override public id 48 options.put("--doctype-system=", Boolean.FALSE); // override system id 49 options.put("--output-encoding=", Boolean.FALSE); // output encoding 50 options.put("--omit-xml-declaration", Boolean.FALSE); // omit XML decl 51 options.put("--encoding=", Boolean.FALSE); // specify encoding 52 options.put("--help", Boolean.FALSE); // display help 53 options.put("--version", Boolean.FALSE); // display version 54 options.put("--nodefaults", Boolean.FALSE); // no default attrs 55 options.put("--nocolons", Boolean.FALSE); // colon to underscore 56 options.put("--norestart", Boolean.FALSE); // no restartable elements 57 options.put("--ignorable", Boolean.FALSE); // return ignorable whitespace 58 } 59 60 /** 61 Main method. Processes specified files or standard input. 62 **/ 63 main(String[] argv)64 public static void main(String[] argv) throws IOException, SAXException { 65 int optind = getopts(options, argv); 66 if (hasOption(options, "--help")) { 67 doHelp(); 68 return; 69 } 70 if (hasOption(options, "--version")) { 71 System.err.println("TagSoup version 1.2"); 72 return; 73 } 74 if (argv.length == optind) { 75 process("", System.out); 76 } 77 else if (hasOption(options, "--files")) { 78 for (int i = optind; i < argv.length; i++) { 79 String src = argv[i]; 80 String dst; 81 int j = src.lastIndexOf('.'); 82 if (j == -1) 83 dst = src + ".xhtml"; 84 else if (src.endsWith(".xhtml")) 85 dst = src + "_"; 86 else 87 dst = src.substring(0, j) + ".xhtml"; 88 System.err.println("src: " + src + " dst: " + dst); 89 OutputStream os = new FileOutputStream(dst); 90 process(src, os); 91 } 92 } 93 else { 94 for (int i = optind; i < argv.length; i++) { 95 System.err.println("src: " + argv[i]); 96 process(argv[i], System.out); 97 } 98 } 99 } 100 101 // Print the help message 102 doHelp()103 private static void doHelp() { 104 System.err.print("usage: java -jar tagsoup-*.jar "); 105 System.err.print(" [ "); 106 boolean first = true; 107 for (Enumeration e = options.keys(); e.hasMoreElements(); ) { 108 if (!first) { 109 System.err.print("| "); 110 } 111 first = false; 112 String key = (String)(e.nextElement()); 113 System.err.print(key); 114 if (key.endsWith("=")) 115 System.err.print("?"); 116 System.err.print(" "); 117 } 118 System.err.println("]*"); 119 } 120 121 private static Parser theParser = null; 122 private static HTMLSchema theSchema = null; 123 private static String theOutputEncoding = null; 124 125 // Process one source onto an output stream. 126 process(String src, OutputStream os)127 private static void process(String src, OutputStream os) 128 throws IOException, SAXException { 129 XMLReader r; 130 if (hasOption(options, "--reuse")) { 131 if (theParser == null) theParser = new Parser(); 132 r = theParser; 133 } 134 else { 135 r = new Parser(); 136 } 137 theSchema = new HTMLSchema(); 138 r.setProperty(Parser.schemaProperty, theSchema); 139 140 if (hasOption(options, "--nocdata")) { 141 r.setFeature(Parser.CDATAElementsFeature, false); 142 } 143 144 if (hasOption(options, "--nons") || hasOption(options, "--html")) { 145 r.setFeature(Parser.namespacesFeature, false); 146 } 147 148 if (hasOption(options, "--nobogons")) { 149 r.setFeature(Parser.ignoreBogonsFeature, true); 150 } 151 152 if (hasOption(options, "--any")) { 153 r.setFeature(Parser.bogonsEmptyFeature, false); 154 } 155 else if (hasOption(options, "--emptybogons")) { 156 r.setFeature(Parser.bogonsEmptyFeature, true); 157 } 158 159 if (hasOption(options, "--norootbogons")) { 160 r.setFeature(Parser.rootBogonsFeature, false); 161 } 162 163 if (hasOption(options, "--nodefaults")) { 164 r.setFeature(Parser.defaultAttributesFeature, false); 165 } 166 if (hasOption(options, "--nocolons")) { 167 r.setFeature(Parser.translateColonsFeature, true); 168 } 169 170 if (hasOption(options, "--norestart")) { 171 r.setFeature(Parser.restartElementsFeature, false); 172 } 173 174 if (hasOption(options, "--ignorable")) { 175 r.setFeature(Parser.ignorableWhitespaceFeature, true); 176 } 177 178 if (hasOption(options, "--pyxin")) { 179 r.setProperty(Parser.scannerProperty, new PYXScanner()); 180 } 181 182 Writer w; 183 if (theOutputEncoding == null) { 184 w = new OutputStreamWriter(os); 185 } 186 else { 187 w = new OutputStreamWriter(os, theOutputEncoding); 188 } 189 ContentHandler h = chooseContentHandler(w); 190 r.setContentHandler(h); 191 if (hasOption(options, "--lexical") && h instanceof LexicalHandler) { 192 r.setProperty(Parser.lexicalHandlerProperty, h); 193 } 194 InputSource s = new InputSource(); 195 if (src != "") { 196 s.setSystemId(src); 197 } 198 else { 199 s.setByteStream(System.in); 200 } 201 if (hasOption(options, "--encoding=")) { 202 // System.out.println("%% Found --encoding"); 203 String encoding = (String)options.get("--encoding="); 204 if (encoding != null) s.setEncoding(encoding); 205 } 206 r.parse(s); 207 } 208 209 // Pick a content handler to generate the desired format. 210 chooseContentHandler(Writer w)211 private static ContentHandler chooseContentHandler(Writer w) { 212 XMLWriter x; 213 if (hasOption(options, "--pyx")) { 214 return new PYXWriter(w); 215 } 216 217 x = new XMLWriter(w); 218 if (hasOption(options, "--html")) { 219 x.setOutputProperty(XMLWriter.METHOD, "html"); 220 x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); 221 } 222 if (hasOption(options, "--method=")) { 223 String method = (String)options.get("--method="); 224 if (method != null) { 225 x.setOutputProperty(XMLWriter.METHOD, method); 226 } 227 } 228 if (hasOption(options, "--doctype-public=")) { 229 String doctype_public = (String)options.get("--doctype-public="); 230 if (doctype_public != null) { 231 x.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC, doctype_public); 232 } 233 } 234 if (hasOption(options, "--doctype-system=")) { 235 String doctype_system = (String)options.get("--doctype-system="); 236 if (doctype_system != null) { 237 x.setOutputProperty(XMLWriter.DOCTYPE_SYSTEM, doctype_system); 238 } 239 } 240 if (hasOption(options, "--output-encoding=")) { 241 theOutputEncoding = (String)options.get("--output-encoding="); 242 // System.err.println("%%%% Output encoding is " + theOutputEncoding); 243 if (theOutputEncoding != null) { 244 x.setOutputProperty(XMLWriter.ENCODING, theOutputEncoding); 245 } 246 } 247 if (hasOption(options, "--omit-xml-declaration")) { 248 x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); 249 } 250 x.setPrefix(theSchema.getURI(), ""); 251 return x; 252 } 253 254 // Options processing 255 getopts(Hashtable options, String[] argv)256 private static int getopts(Hashtable options, String[] argv) { 257 int optind; 258 for (optind = 0; optind < argv.length; optind++) { 259 String arg = argv[optind]; 260 String value = null; 261 if (arg.charAt(0) != '-') break; 262 int eqsign = arg.indexOf('='); 263 if (eqsign != -1) { 264 value = arg.substring(eqsign + 1, arg.length()); 265 arg = arg.substring(0, eqsign + 1); 266 } 267 if (options.containsKey(arg)) { 268 if (value == null) options.put(arg, Boolean.TRUE); 269 else options.put(arg, value); 270 // System.out.println("%% Parsed [" + arg + "]=[" + value + "]"); 271 } 272 else { 273 System.err.print("Unknown option "); 274 System.err.println(arg); 275 System.exit(1); 276 } 277 } 278 return optind; 279 } 280 281 // Return true if an option exists. 282 hasOption(Hashtable options, String option)283 private static boolean hasOption(Hashtable options, String option) { 284 if (Boolean.getBoolean(option)) return true; 285 else if (options.get(option) != Boolean.FALSE) return true; 286 return false; 287 } 288 289 } 290