/*
 * Copyright Simon Pepping 2009
 *
 * The copyright owner licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.tug.texhyphen;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;

/**
 * The class LanguageDataParser parses a language data file and produces SAX events.
 *
 * <p>The input is read line by line. At top level only <code>#</code> comments,
 * white space and an opening <code>{</code> are recognized. Between <code>{</code>
 * and <code>}</code> the parser recognizes lines of the form
 * <code>"key" =&gt; "value",</code>, <code>"key" =&gt; true|false|nil,</code> and
 * <code>"key" =&gt; [v1,v2,...],</code>. Each <code>{ ... }</code> group is reported
 * as one <code>language</code> element inside a <code>languages</code> root element,
 * both in the namespace {@link #LANG_NAMESPACE}.</p>
 *
 * <p>A content handler must be set before one of the <code>parse</code> methods
 * is called. Comments are only reported when a lexical handler has been set.</p>
 */
public class LanguageDataParser implements XMLReader {

    public static final String LANG_NAMESPACE = "urn:org:tug:texhyphen:languagedata";
    /** Not referenced anywhere within this class. */
    public static int lineLength = 72;
    private static final int TOP_LEVEL = 3, IN_LANG = 4;
    private static final String LEXICAL_HANDLER_PROPERTY
        = "http://xml.org/sax/properties/lexical-handler";
    private static final Pattern
        comment = Pattern.compile("#.*"),
        langStart = Pattern.compile("{", Pattern.LITERAL),
        langEnd = Pattern.compile("}", Pattern.LITERAL),
        dataline = Pattern.compile("\"([^\"]+)\" ?=> ?\"([^\"]+)\","),
        keywordline = Pattern.compile("\"([^\"]+)\" ?=> ?(false|true|nil),"),
        listline = Pattern.compile("\"([^\"]+)\" ?=> ?\\[(\"[^\"]+\"(?:,\"[^\"]+\")*)\\],"),
        datalistline = Pattern.compile("\"([^\"]+)\" ?=> ?\\[([^,]+(?:,[^,]+)*)\\],"),
        space = Pattern.compile("[ \\t]+"),
        // the charset parameter of a Content-Type header value, e.g. "text/plain; charset=utf-8"
        charsetParam = Pattern.compile("(?i);\\s*charset\\s*=\\s*\"?([^\\s;\"]+)\"?");
    private static final AttributesImpl emptyAtts = new AttributesImpl();

    private ContentHandler contentHandler;
    private DTDHandler dtdHandler;
    private EntityResolver entityResolver;
    private ErrorHandler errorHandler;
    private LexicalHandler lexicalHandler;

    /**
     * Reads the language data line by line and sends the corresponding SAX events
     * to the content handler (and comments to the lexical handler, if one is set).
     * When no pattern matches at the current position, the remainder of that line
     * is skipped without notice.
     *
     * @param inbr the reader from which the language data are read; it is not closed here
     * @throws SAXException if one of the handlers throws it
     * @throws IOException if reading a line fails
     */
    private void parseLanguageData(BufferedReader inbr) throws SAXException, IOException {
        int parseState = TOP_LEVEL;
        Language lang = null;

        contentHandler.startDocument();
        contentHandler.startPrefixMapping("", LANG_NAMESPACE);
        contentHandler.startElement(LANG_NAMESPACE, "languages", "languages", emptyAtts);

        for (String line = inbr.readLine(); line != null; line = inbr.readLine()) {
            Matcher matcher = comment.matcher(line).useAnchoringBounds(true);
            int start = 0;
            while (start < line.length()) {
                if (matcher.usePattern(comment).lookingAt()) {
                    processComment(matcher.group(), parseState == TOP_LEVEL ? null : lang);
                } else if (matcher.usePattern(space).lookingAt()) {
                    // do nothing
                } else if (parseState == TOP_LEVEL && matcher.usePattern(langStart).lookingAt()) {
                    parseState = IN_LANG;
                    lang = new Language();
                } else if ((parseState == IN_LANG) && matcher.usePattern(langEnd).lookingAt()) {
                    pushoutLanguage(lang);
                    lang = null;
                    parseState = TOP_LEVEL;
                } else if (parseState == IN_LANG
                        && (matcher.usePattern(dataline).lookingAt()
                                || matcher.usePattern(keywordline).lookingAt())) {
                    String key = matcher.group(1);
                    String value = matcher.group(2);
                    processDataline(key, value, lang);
                } else if (parseState == IN_LANG
                        && (matcher.usePattern(listline).lookingAt()
                                || matcher.usePattern(datalistline).lookingAt())) {
                    String key = matcher.group(1);
                    String values = matcher.group(2);
                    processListline(key, values, lang);
                } else {
                    // nothing recognizable at this position: give up on this line
                    break;
                }
                // continue matching right after the token that was just consumed
                start = matcher.end();
                matcher = matcher.region(start, line.length()).useAnchoringBounds(true);
            }
        }

        contentHandler.endElement(LANG_NAMESPACE, "languages", "languages");
        // endPrefixMapping takes the prefix that was started above, not the namespace URI
        contentHandler.endPrefixMapping("");
        contentHandler.endDocument();
    }

    /**
     * The keys which are output as attributes of the language element;
     * all other keys are output as child elements.
     */
    static Collection<String> attributeKeys;
    static {
        attributeKeys = new Vector<String>();
        attributeKeys.add("code");
        attributeKeys.add("name");
        attributeKeys.add("use-old-patterns");
        attributeKeys.add("use-new-loader");
        attributeKeys.add("encoding");
        attributeKeys.add("exceptions");
    }

    /**
     * Handles a comment. Each "--" is replaced and a trailing space is ensured
     * before the comment is passed on. The first character (the '#') is not
     * part of the reported comment text.
     *
     * @param comment the comment text, starting with '#'
     * @param lang the language being collected, or null at top level. At top level
     *        the comment is sent to the lexical handler at once (if one is set);
     *        otherwise it is stored in the language and sent with it.
     * @throws SAXException if the lexical handler throws it
     */
    private void processComment(String comment, Language lang) throws SAXException {
        comment = comment.replace("--", "––");
        if (!comment.endsWith(" ")) {
            comment = comment + " ";
        }
        if (lang == null) {
            char[] textchars = comment.toCharArray();
            if (lexicalHandler != null) {
                lexicalHandler.comment(textchars, 1, textchars.length - 1);
            }
        } else {
            lang.elements.add(new Element("comment", comment));
        }
    }

    /**
     * Handles a line with a single value. Underscores in the key are replaced
     * with hyphens, and the value "nil" becomes the empty string.
     *
     * @param key the key
     * @param value the value
     * @param lang the language which receives the key as attribute or child element
     */
    private void processDataline(String key, String value, Language lang) {
        key = key.replace('_', '-');
        if (value.equals("nil")) {
            value = "";
        }
        if (attributeKeys.contains(key)) {
            lang.atts.addAttribute("", key, key, "CDATA", value);
        } else {
            lang.elements.add(new Element(key, value));
        }
    }

    /**
     * Handles a line with a list of values.
     * <ul>
     * <li>An attribute key gets all values other than "nil" as a single attribute
     * value, each value preceded by a space.</li>
     * <li>The key "hyphenmin" becomes an empty element "hyphen-min" whose attributes
     * "before" and "after" hold the first two values; the list must have at least
     * two entries.</li>
     * <li>Any other key loses a final "s" and produces one element per value.</li>
     * </ul>
     *
     * @param key the key
     * @param valuesString the comma separated values, possibly in double quotes
     * @param lang the language which receives the values
     */
    private void processListline(String key, String valuesString, Language lang) {
        key = key.replace('_', '-');
        valuesString = valuesString.replace("\"", "");
        String[] values = valuesString.split(",[ \\t]*");
        if (attributeKeys.contains(key)) {
            StringBuilder attValue = new StringBuilder();
            for (String value : values) {
                if (!value.equals("nil")) {
                    attValue.append(' ').append(value);
                }
            }
            lang.atts.addAttribute("", key, key, "CDATA", attValue.toString());
        } else if (key.equals("hyphenmin")) {
            key = "hyphen-min";
            AttributesImpl atts = new AttributesImpl();
            atts.addAttribute("", "before", "before", "CDATA", values[0]);
            atts.addAttribute("", "after", "after", "CDATA", values[1]);
            lang.elements.add(new Element(key, "", atts));
        } else {
            key = key.replaceAll("s$", "");
            for (String value : values) {
                if (value.equals("nil")) {
                    value = "";
                }
                lang.elements.add(new Element(key, value));
            }
        }
    }

    /**
     * Sends the SAX events for a complete language: the language element with
     * its attributes, and its stored comments and child elements in the order
     * in which they were collected.
     *
     * @param lang the language to be output
     * @throws SAXException if one of the handlers throws it
     */
    private void pushoutLanguage(Language lang) throws SAXException {
        contentHandler.startElement(LANG_NAMESPACE, "language", "language", lang.atts);
        for (Element elt : lang.elements) {
            char[] text = elt.content.toCharArray();
            if (elt.tag.equals("comment")) {
                if (lexicalHandler != null) {
                    // skip the leading '#'
                    lexicalHandler.comment(text, 1, text.length - 1);
                }
            } else {
                contentHandler.startElement(LANG_NAMESPACE, elt.tag, elt.tag, elt.atts);
                contentHandler.characters(text, 0, text.length);
                contentHandler.endElement(LANG_NAMESPACE, elt.tag, elt.tag);
            }
        }
        contentHandler.endElement(LANG_NAMESPACE, "language", "language");
    }

    /**
     * Obtains a reader from an input source. The character stream is preferred,
     * then the byte stream (decoded with the encoding of the input source, if any),
     * and finally the system ID.
     *
     * @param input the input source
     * @return the reader, or null if the system ID is an absolute URI with
     *         a scheme other than http or https
     * @throws IOException if the input source has neither stream nor system ID,
     *         or if opening the stream fails
     */
    public Reader getReaderFromInputSource(InputSource input) throws IOException {
        Reader reader = input.getCharacterStream();
        if (reader != null) {
            return reader;
        }
        String encoding = input.getEncoding();
        InputStream stream = input.getByteStream();
        if (stream != null) {
            if (encoding == null) {
                return new InputStreamReader(stream);
            }
            return new InputStreamReader(stream, encoding);
        }
        return getReaderFromSystemId(input.getSystemId(), encoding);
    }

    /**
     * Obtains a reader from a system ID. A system ID which is not an absolute URI
     * is treated as a file name. Of the absolute URIs only those with scheme http
     * or https are opened.
     *
     * @param systemId the system ID
     * @param encoding the character encoding; null or the empty string means that
     *        the platform default is used for files, and for URLs the charset of
     *        the Content-Type header (or else the platform default)
     * @return the reader, or null if the URI scheme is not supported
     * @throws IOException if systemId is null, or if opening the file or URL fails
     */
    public Reader getReaderFromSystemId(String systemId, String encoding) throws IOException {
        if (systemId == null) {
            throw new IOException("Cannot create a reader from a null systemID");
        }
        // encoding may legitimately be null, e.g. when called from parse(String)
        if (encoding != null && encoding.isEmpty()) {
            encoding = null;
        }
        URI uri = null;
        try {
            uri = new URI(systemId);
        } catch (URISyntaxException e) {
            // not a URI: it is treated as a file name below
        }
        if (uri == null || !uri.isAbsolute()) {
            return openFile(new File(systemId), encoding);
        }
        String scheme = uri.getScheme();
        if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
            return openUrl(uri.toURL(), encoding);
        }
        return null;
    }

    /**
     * Opens a file for reading.
     *
     * @param file the file
     * @param encoding the character encoding, or null for the platform default
     * @return the reader
     * @throws IOException if the file cannot be opened or the encoding is not supported
     */
    private static Reader openFile(File file, String encoding) throws IOException {
        if (encoding == null) {
            return new FileReader(file);
        }
        InputStream stream = new FileInputStream(file);
        try {
            return new InputStreamReader(stream, encoding);
        } catch (IOException e) {
            // unsupported encoding: do not leave the file open
            stream.close();
            throw e;
        }
    }

    /**
     * Opens a URL for reading.
     *
     * @param url the URL
     * @param encoding the character encoding, or null to use the charset parameter
     *        of the Content-Type header, or else the platform default
     * @return the reader
     * @throws IOException if the connection fails or the encoding is not supported
     */
    private static Reader openUrl(URL url, String encoding) throws IOException {
        URLConnection conn = url.openConnection();
        InputStream stream = conn.getInputStream();
        try {
            if (encoding == null) {
                // The Content-Encoding header names a compression, not a charset;
                // the charset is a parameter of the Content-Type header.
                String contentType = conn.getContentType();
                if (contentType != null) {
                    Matcher matcher = charsetParam.matcher(contentType);
                    if (matcher.find()) {
                        encoding = matcher.group(1);
                    }
                }
            }
            if (encoding == null) {
                return new InputStreamReader(stream);
            }
            return new InputStreamReader(stream, encoding);
        } catch (IOException e) {
            stream.close();
            throw e;
        }
    }

    /**
     * Parses the language data from the reader and closes the reader afterwards,
     * whether parsing succeeded or not.
     *
     * @param reader the reader
     * @throws IOException if reading or closing fails
     * @throws SAXException if one of the handlers throws it
     */
    private void parseAndClose(Reader reader) throws IOException, SAXException {
        BufferedReader inbr = new BufferedReader(reader);
        try {
            parseLanguageData(inbr);
        } finally {
            inbr.close();
        }
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getContentHandler()
     */
    public ContentHandler getContentHandler() {
        return contentHandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    public DTDHandler getDTDHandler() {
        return dtdHandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    public ErrorHandler getErrorHandler() {
        return errorHandler;
    }


    /**
     * @return the lexicalHandler
     */
    public LexicalHandler getLexicalHandler() {
        return lexicalHandler;
    }

    /**
     * No feature is supported.
     *
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    public boolean getFeature(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        throw new SAXNotSupportedException();
    }

    /**
     * The only supported property is the lexical handler,
     * <code>http://xml.org/sax/properties/lexical-handler</code>.
     *
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    public Object getProperty(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (arg0.equals(LEXICAL_HANDLER_PROPERTY)) {
            return lexicalHandler;
        }
        throw new SAXNotSupportedException();
    }

    /**
     * The reader obtained from the input source is closed when parsing ends.
     *
     * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
     */
    public void parse(InputSource input) throws IOException, SAXException {
        Reader reader = getReaderFromInputSource(input);
        if (reader == null) {
            throw new IOException("Could not open input source " + input);
        }
        parseAndClose(reader);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#parse(java.lang.String)
     */
    public void parse(String systemId) throws IOException, SAXException {
        Reader reader = getReaderFromSystemId(systemId, null);
        if (reader == null) {
            throw new IOException("Could not open input systemID " + systemId);
        }
        parseAndClose(reader);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
     */
    public void setContentHandler(ContentHandler contenthandler) {
        this.contentHandler = contenthandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    public void setDTDHandler(DTDHandler dtdhandler) {
        this.dtdHandler = dtdhandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    public void setEntityResolver(EntityResolver entityresolver) {
        this.entityResolver = entityresolver;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(ErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
    }


    /**
     * @param lexicalHandler the lexicalHandler to set
     */
    public void setLexicalHandler(LexicalHandler lexicalHandler) {
        this.lexicalHandler = lexicalHandler;
    }

    /**
     * No feature is supported.
     *
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    public void setFeature(String arg0, boolean arg1)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        throw new SAXNotSupportedException();
    }

    /**
     * The only supported property is the lexical handler,
     * <code>http://xml.org/sax/properties/lexical-handler</code>. Its value must be
     * a LexicalHandler or null; any other value is rejected and leaves the current
     * lexical handler in place.
     *
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
     */
    public void setProperty(String name, Object value)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (name.equals(LEXICAL_HANDLER_PROPERTY)) {
            if (value != null && !(value instanceof LexicalHandler)) {
                throw new SAXNotSupportedException
                    ("The value of property " + name + " must be a LexicalHandler");
            }
            lexicalHandler = (LexicalHandler) value;
        } else {
            throw new SAXNotSupportedException();
        }
    }

    /**
     * A child element or comment of a language, stored until the language is output.
     * A comment has the tag "comment" and its text, including the leading '#', as content.
     */
    private static class Element {
        String tag;
        String content;
        Attributes atts;
        Element(String tag, String content) {
            this(tag, content, LanguageDataParser.emptyAtts);
        }
        Element(String tag, String content, Attributes atts) {
            this.tag = tag;
            this.content = content;
            this.atts = atts;
        }
    }

    /**
     * The attributes and the child elements of a language, collected while
     * the lines between '{' and '}' are read.
     */
    private static class Language {
        AttributesImpl atts;
        List<Element> elements;
        Language() {
            atts = new AttributesImpl();
            elements = new Vector<Element>();
        }
    }

}