1 /* 2 * Copyright Simon Pepping 2009 3 * 4 * The copyright owner licenses this file to You under the Apache License, Version 2.0 5 * (the "License"); you may not use this file except in compliance with 6 * the License. You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* $Id$ */ 18 19 package org.tug.texhyphen; 20 21 import java.io.BufferedReader; 22 import java.io.File; 23 import java.io.FileInputStream; 24 import java.io.FileReader; 25 import java.io.IOException; 26 import java.io.InputStream; 27 import java.io.InputStreamReader; 28 import java.io.Reader; 29 import java.net.URI; 30 import java.net.URISyntaxException; 31 import java.net.URL; 32 import java.net.URLConnection; 33 import java.util.Stack; 34 import java.util.regex.Matcher; 35 import java.util.regex.Pattern; 36 37 import org.xml.sax.ContentHandler; 38 import org.xml.sax.DTDHandler; 39 import org.xml.sax.EntityResolver; 40 import org.xml.sax.ErrorHandler; 41 import org.xml.sax.InputSource; 42 import org.xml.sax.SAXException; 43 import org.xml.sax.SAXNotRecognizedException; 44 import org.xml.sax.SAXNotSupportedException; 45 import org.xml.sax.XMLReader; 46 import org.xml.sax.ext.LexicalHandler; 47 import org.xml.sax.helpers.AttributesImpl; 48 49 /** 50 * The class TeXParser parses TeX hyphenation pattern files and produces SAX events 51 */ 52 public class TeXPatternParser implements XMLReader { 53 54 public static final String TEX_NAMESPACE = "urn:org:tug:texhyphen"; 55 private static final int TOP_LEVEL = 3, IN_COMMAND = 4, AFTER_COMMAND = 5, IN_DATA = 6; 56 private static final Pattern 57 comment = Pattern.compile("%.*"), 58 commandStart = Pattern.compile("\\\\"), 59 command = Pattern.compile("[a-zA-Z]+"), 60 space = Pattern.compile(" +"), 61 argOpen = Pattern.compile("\\{"), 62 argClose = Pattern.compile("\\}"), 63 text = Pattern.compile("[^%\\\\\\{\\}]+"); 64 private static final AttributesImpl emptyAtts = new AttributesImpl(); 65 66 private ContentHandler contentHandler; 67 private DTDHandler dtdHandler; 68 private EntityResolver entityResolver; 69 private ErrorHandler errorHandler; 70 private LexicalHandler lexicalHandler; 71 parsePatterns(BufferedReader inbr)72 private void parsePatterns(BufferedReader inbr) throws SAXException, IOException { 73 int parseState = TOP_LEVEL; 74 Stack<String> stack = new Stack<String>(); 75 76 contentHandler.startDocument(); 77 contentHandler.startPrefixMapping("", TEX_NAMESPACE); 78 contentHandler.startElement(TEX_NAMESPACE, "tex", "tex", emptyAtts); 79 80 for (String line = inbr.readLine(); line != null; line = inbr.readLine()) { 81 Matcher matcher = comment.matcher(line).useAnchoringBounds(true); 82 int start = 0; 83 char[] textchars; 84 boolean inComment = false; 85 while (start < line.length()) { 86 if (matcher.usePattern(comment).lookingAt()) { 87 String text = matcher.group().replace("--", "––"); 88 textchars = (text + "\n").toCharArray(); 89 if (lexicalHandler != null) { 90 lexicalHandler.comment(textchars, 1, textchars.length - 1); 91 } 92 inComment = true; 93 } else if (parseState == IN_DATA && matcher.usePattern(text).lookingAt()) { 94 textchars = matcher.group().toCharArray(); 95 contentHandler.characters(textchars, 0, textchars.length); 96 } else if (parseState != IN_DATA && matcher.usePattern(space).lookingAt()) { 97 if (parseState == TOP_LEVEL) { 98 textchars = matcher.group().toCharArray(); 99 contentHandler.ignorableWhitespace(textchars, 0, textchars.length); 100 } 101 } else if (parseState == TOP_LEVEL && matcher.usePattern(commandStart).lookingAt()) { 102 parseState = IN_COMMAND; 103 } else if (parseState == IN_COMMAND && matcher.usePattern(command).lookingAt()) { 104 String tag = matcher.group(); 105 contentHandler.startElement(TEX_NAMESPACE, tag, tag, emptyAtts); 106 stack.push(tag); 107 parseState = AFTER_COMMAND; 108 } else if (parseState == AFTER_COMMAND && matcher.usePattern(argOpen).lookingAt()) { 109 parseState = IN_DATA; 110 } else if (parseState == IN_DATA && matcher.usePattern(argClose).lookingAt()) { 111 String tag = stack.pop(); 112 contentHandler.endElement(TEX_NAMESPACE, tag, tag); 113 parseState = TOP_LEVEL; 114 } else { 115 break; 116 } 117 start = matcher.end(); 118 matcher = matcher.region(start, line.length()).useAnchoringBounds(true); 119 } 120 textchars = "\n".toCharArray(); 121 if (parseState == IN_DATA && !inComment) { 122 contentHandler.characters(textchars, 0, textchars.length); 123 } else if (parseState == TOP_LEVEL && !inComment) { 124 contentHandler.ignorableWhitespace(textchars, 0, textchars.length); 125 } 126 } 127 128 contentHandler.endElement(TEX_NAMESPACE, "tex", "tex"); 129 contentHandler.endPrefixMapping(TEX_NAMESPACE); 130 contentHandler.endDocument(); 131 } 132 getReaderFromInputSource(InputSource input)133 public Reader getReaderFromInputSource(InputSource input) throws IOException { 134 Reader reader = input.getCharacterStream(); 135 String encoding = null; 136 if (reader == null) { 137 encoding = input.getEncoding(); 138 } 139 if (reader == null) { 140 InputStream stream = input.getByteStream(); 141 if (stream != null) { 142 if (encoding == null) { 143 reader = new InputStreamReader(stream); 144 } else { 145 reader = new InputStreamReader(stream, encoding); 146 } 147 } 148 } 149 if (reader == null) { 150 String systemId = input.getSystemId(); 151 reader = getReaderFromSystemId(systemId, encoding); 152 } 153 return reader; 154 } 155 getReaderFromSystemId(String systemId, String encoding)156 public Reader getReaderFromSystemId(String systemId, String encoding) throws IOException { 157 if (systemId == null) { 158 throw new IOException("Cannot create a reader from a null systemID"); 159 } 160 if (encoding.isEmpty()) { 161 encoding = null; 162 } 163 Reader reader = null; 164 URI uri = null; 165 File file = null; 166 try { 167 uri = new URI(systemId); 168 } catch (URISyntaxException e) { 169 // handled below 170 } 171 if (uri == null || !uri.isAbsolute()) { 172 file = new File(systemId); 173 } 174 if (file != null) { 175 if (encoding == null) { 176 reader = new FileReader(file); 177 } else { 178 InputStream stream = new FileInputStream(file); 179 reader = new InputStreamReader(stream, encoding); 180 } 181 } else if (uri != null && uri.getScheme().equals("http")) { 182 URL url = uri.toURL(); 183 URLConnection conn = url.openConnection(); 184 if (encoding == null) { 185 encoding = conn.getContentEncoding(); 186 } 187 InputStream stream = conn.getInputStream(); 188 reader = new InputStreamReader(stream, encoding); 189 } 190 return reader; 191 } 192 193 /* (non-Javadoc) 194 * @see org.xml.sax.XMLReader#getContentHandler() 195 */ getContentHandler()196 public ContentHandler getContentHandler() { 197 return contentHandler; 198 } 199 200 /* (non-Javadoc) 201 * @see org.xml.sax.XMLReader#getDTDHandler() 202 */ getDTDHandler()203 public DTDHandler getDTDHandler() { 204 return dtdHandler; 205 } 206 207 /* (non-Javadoc) 208 * @see org.xml.sax.XMLReader#getEntityResolver() 209 */ getEntityResolver()210 public EntityResolver getEntityResolver() { 211 return entityResolver; 212 } 213 214 /* (non-Javadoc) 215 * @see org.xml.sax.XMLReader#getErrorHandler() 216 */ getErrorHandler()217 public ErrorHandler getErrorHandler() { 218 return errorHandler; 219 } 220 221 222 /** 223 * @return the lexicalHandler 224 */ getLexicalHandler()225 public LexicalHandler getLexicalHandler() { 226 return lexicalHandler; 227 } 228 229 /* (non-Javadoc) 230 * @see org.xml.sax.XMLReader#getFeature(java.lang.String) 231 */ getFeature(String arg0)232 public boolean getFeature(String arg0) 233 throws SAXNotRecognizedException, SAXNotSupportedException { 234 throw new SAXNotSupportedException(); 235 } 236 237 /* (non-Javadoc) 238 * @see org.xml.sax.XMLReader#getProperty(java.lang.String) 239 */ getProperty(String arg0)240 public Object getProperty(String arg0) 241 throws SAXNotRecognizedException, SAXNotSupportedException { 242 throw new SAXNotSupportedException(); 243 } 244 245 /* (non-Javadoc) 246 * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource) 247 */ parse(InputSource input)248 public void parse(InputSource input) throws IOException, SAXException { 249 Reader reader = getReaderFromInputSource(input); 250 if (reader == null) { 251 throw new IOException("Could not open input source " + input); 252 } 253 BufferedReader inbr = new BufferedReader(reader); 254 parsePatterns(inbr); 255 } 256 257 /* (non-Javadoc) 258 * @see org.xml.sax.XMLReader#parse(java.lang.String) 259 */ parse(String systemId)260 public void parse(String systemId) throws IOException, SAXException { 261 Reader reader = getReaderFromSystemId(systemId, null); 262 if (reader == null) { 263 throw new IOException("Could not open input systemID " + systemId); 264 } 265 BufferedReader inbr = new BufferedReader(reader); 266 parsePatterns(inbr); 267 } 268 269 /* (non-Javadoc) 270 * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler) 271 */ setContentHandler(ContentHandler contenthandler)272 public void setContentHandler(ContentHandler contenthandler) { 273 this.contentHandler = contenthandler; 274 } 275 276 /* (non-Javadoc) 277 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler) 278 */ setDTDHandler(DTDHandler dtdhandler)279 public void setDTDHandler(DTDHandler dtdhandler) { 280 this.dtdHandler = dtdhandler; 281 } 282 283 /* (non-Javadoc) 284 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver) 285 */ setEntityResolver(EntityResolver entityresolver)286 public void setEntityResolver(EntityResolver entityresolver) { 287 this.entityResolver = entityresolver; 288 } 289 290 /* (non-Javadoc) 291 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 292 */ setErrorHandler(ErrorHandler errorHandler)293 public void setErrorHandler(ErrorHandler errorHandler) { 294 this.errorHandler = errorHandler; 295 } 296 297 298 /** 299 * @param lexicalHandler the lexicalHandler to set 300 */ setLexicalHandler(LexicalHandler lexicalHandler)301 public void setLexicalHandler(LexicalHandler lexicalHandler) { 302 this.lexicalHandler = lexicalHandler; 303 } 304 305 /* (non-Javadoc) 306 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean) 307 */ setFeature(String arg0, boolean arg1)308 public void setFeature(String arg0, boolean arg1) 309 throws SAXNotRecognizedException, SAXNotSupportedException { 310 throw new SAXNotSupportedException(); 311 } 312 313 /* (non-Javadoc) 314 * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object) 315 */ setProperty(String name, Object value)316 public void setProperty(String name, Object value) 317 throws SAXNotRecognizedException, SAXNotSupportedException { 318 if (name.equals("http://xml.org/sax/properties/lexical-handler")) { 319 lexicalHandler = (LexicalHandler) value; 320 } else { 321 throw new SAXNotSupportedException(); 322 } 323 } 324 325 } 326