• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright Simon Pepping 2009
3  *
4  * The copyright owner licenses this file to You under the Apache License, Version 2.0
5  * (the "License"); you may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /* $Id$ */
18 
19 package org.tug.texhyphen;
20 
21 import java.io.BufferedReader;
22 import java.io.File;
23 import java.io.FileInputStream;
24 import java.io.FileReader;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.InputStreamReader;
28 import java.io.Reader;
29 import java.net.URI;
30 import java.net.URISyntaxException;
31 import java.net.URL;
32 import java.net.URLConnection;
33 import java.util.Stack;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36 
37 import org.xml.sax.ContentHandler;
38 import org.xml.sax.DTDHandler;
39 import org.xml.sax.EntityResolver;
40 import org.xml.sax.ErrorHandler;
41 import org.xml.sax.InputSource;
42 import org.xml.sax.SAXException;
43 import org.xml.sax.SAXNotRecognizedException;
44 import org.xml.sax.SAXNotSupportedException;
45 import org.xml.sax.XMLReader;
46 import org.xml.sax.ext.LexicalHandler;
47 import org.xml.sax.helpers.AttributesImpl;
48 
/**
 * An {@link XMLReader} that parses TeX hyphenation pattern files and reports
 * their content as SAX events.
 * <p>
 * The whole document is wrapped in a <code>tex</code> element in the
 * {@link #TEX_NAMESPACE} namespace (bound to the default prefix). Every TeX
 * command of the form <code>\name{...}</code> found at top level becomes an
 * element called <code>name</code> whose character content is the text between
 * the braces. TeX comments (<code>%</code> to end of line) are passed to the
 * registered {@link LexicalHandler}, if there is one.
 * <p>
 * Input that does not fit this simple grammar is skipped up to the end of the
 * line on which it occurs.
 */
public class TeXPatternParser implements XMLReader {

    public static final String TEX_NAMESPACE = "urn:org:tug:texhyphen";

    private static final String LEXICAL_HANDLER_PROPERTY
        = "http://xml.org/sax/properties/lexical-handler";
    private static final String NAMESPACES_FEATURE
        = "http://xml.org/sax/features/namespaces";
    private static final String NAMESPACE_PREFIXES_FEATURE
        = "http://xml.org/sax/features/namespace-prefixes";

    // States of the line scanner in parsePatterns.
    private static final int TOP_LEVEL = 3, IN_COMMAND = 4, AFTER_COMMAND = 5, IN_DATA = 6;
    private static final Pattern
    comment = Pattern.compile("%.*"),
    commandStart = Pattern.compile("\\\\"),
    command = Pattern.compile("[a-zA-Z]+"),
    space = Pattern.compile(" +"),
    argOpen = Pattern.compile("\\{"),
    argClose = Pattern.compile("\\}"),
    text = Pattern.compile("[^%\\\\\\{\\}]+"),
    // Extracts the charset parameter from an HTTP Content-Type header value.
    charsetParam = Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);
    private static final AttributesImpl emptyAtts = new AttributesImpl();
    // Receives the events when the application has not registered a content handler.
    private static final ContentHandler NO_OP_HANDLER = new org.xml.sax.helpers.DefaultHandler();

    private ContentHandler contentHandler;
    private DTDHandler dtdHandler;
    private EntityResolver entityResolver;
    private ErrorHandler errorHandler;
    private LexicalHandler lexicalHandler;

    /**
     * Returns the handler that content events must be sent to right now.
     * It is looked up for every event so that a handler registered in the
     * middle of a parse is used immediately, and so that a missing handler
     * causes events to be dropped instead of a NullPointerException.
     */
    private ContentHandler handler() {
        return contentHandler != null ? contentHandler : NO_OP_HANDLER;
    }

    /**
     * Scans the input line by line and emits the corresponding SAX events.
     *
     * @param inbr the pattern file; it is read to the end but not closed here
     * @throws SAXException if a handler rejects an event
     * @throws IOException if reading the input fails
     */
    private void parsePatterns(BufferedReader inbr) throws SAXException, IOException {
        int parseState = TOP_LEVEL;
        Stack<String> stack = new Stack<String>();

        handler().startDocument();
        handler().startPrefixMapping("", TEX_NAMESPACE);
        handler().startElement(TEX_NAMESPACE, "tex", "tex", emptyAtts);

        for (String line = inbr.readLine(); line != null; line = inbr.readLine()) {
            Matcher matcher = comment.matcher(line).useAnchoringBounds(true);
            int start = 0;
            char[] textchars;
            boolean inComment = false;
            while (start < line.length()) {
                if (matcher.usePattern(comment).lookingAt()) {
                    // "--" is not allowed inside an XML comment, so it is replaced
                    String commentText = matcher.group().replace("--", "––");
                    textchars = (commentText + "\n").toCharArray();
                    if (lexicalHandler != null) {
                        // skip the leading '%'; the line end is part of the comment
                        lexicalHandler.comment(textchars, 1, textchars.length - 1);
                    }
                    inComment = true;
                } else if (parseState == IN_DATA && matcher.usePattern(text).lookingAt()) {
                    textchars = matcher.group().toCharArray();
                    handler().characters(textchars, 0, textchars.length);
                } else if (parseState != IN_DATA && matcher.usePattern(space).lookingAt()) {
                    // spaces inside "\name {" are dropped; only top-level ones are reported
                    if (parseState == TOP_LEVEL) {
                        textchars = matcher.group().toCharArray();
                        handler().ignorableWhitespace(textchars, 0, textchars.length);
                    }
                } else if (parseState == TOP_LEVEL && matcher.usePattern(commandStart).lookingAt()) {
                    parseState = IN_COMMAND;
                } else if (parseState == IN_COMMAND && matcher.usePattern(command).lookingAt()) {
                    String tag = matcher.group();
                    handler().startElement(TEX_NAMESPACE, tag, tag, emptyAtts);
                    stack.push(tag);
                    parseState = AFTER_COMMAND;
                } else if (parseState == AFTER_COMMAND && matcher.usePattern(argOpen).lookingAt()) {
                    parseState = IN_DATA;
                } else if (parseState == IN_DATA && matcher.usePattern(argClose).lookingAt()) {
                    String tag = stack.pop();
                    handler().endElement(TEX_NAMESPACE, tag, tag);
                    parseState = TOP_LEVEL;
                } else {
                    // nothing applies in the current state: give up on the rest of this line
                    break;
                }
                start = matcher.end();
                matcher = matcher.region(start, line.length()).useAnchoringBounds(true);
            }
            // Report the line end, unless a comment on this line already carried it.
            textchars = "\n".toCharArray();
            if (parseState == IN_DATA && !inComment) {
                handler().characters(textchars, 0, textchars.length);
            } else if (parseState == TOP_LEVEL && !inComment) {
                handler().ignorableWhitespace(textchars, 0, textchars.length);
            }
        }

        // A command that was still open at end of input must be closed,
        // otherwise the event stream is not well-formed.
        while (!stack.isEmpty()) {
            String tag = stack.pop();
            handler().endElement(TEX_NAMESPACE, tag, tag);
        }

        handler().endElement(TEX_NAMESPACE, "tex", "tex");
        // endPrefixMapping takes the prefix that was mapped, not the namespace URI
        handler().endPrefixMapping("");
        handler().endDocument();
    }

    /**
     * Runs the parser over the reader and closes the reader afterwards,
     * whether or not parsing succeeded.
     */
    private void parseAndClose(Reader reader) throws IOException, SAXException {
        BufferedReader inbr = new BufferedReader(reader);
        try {
            parsePatterns(inbr);
        } finally {
            inbr.close();
        }
    }

    /**
     * Wraps a stream opened by this class in a reader. If the reader cannot
     * be created (for example because the encoding is unknown) the stream is
     * closed before the exception is passed on.
     *
     * @param stream the byte stream to decode
     * @param encoding the charset name, or null for the platform default
     */
    private static Reader openReader(InputStream stream, String encoding) throws IOException {
        try {
            if (encoding == null) {
                return new InputStreamReader(stream);
            }
            return new InputStreamReader(stream, encoding);
        } catch (IOException e) {
            stream.close();
            throw e;
        }
    }

    /**
     * Returns the value of the charset parameter of a Content-Type header,
     * or null if the header is absent or has no such parameter.
     */
    private static String charsetOf(String contentType) {
        if (contentType == null) {
            return null;
        }
        Matcher matcher = charsetParam.matcher(contentType);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Obtains a reader for an input source. The character stream is preferred,
     * then the byte stream (decoded with the source's encoding, if any), then
     * the system identifier.
     *
     * @param input the input source
     * @return a reader, or null if the system identifier uses an unsupported scheme
     * @throws IOException if the source offers neither a stream nor a system
     *         identifier, or if the resource cannot be opened
     */
    public Reader getReaderFromInputSource(InputSource input) throws IOException {
        Reader reader = input.getCharacterStream();
        if (reader != null) {
            return reader;
        }
        String encoding = input.getEncoding();
        if (encoding != null && encoding.isEmpty()) {
            // same convention as getReaderFromSystemId: empty means unspecified
            encoding = null;
        }
        InputStream stream = input.getByteStream();
        if (stream != null) {
            if (encoding == null) {
                return new InputStreamReader(stream);
            }
            return new InputStreamReader(stream, encoding);
        }
        return getReaderFromSystemId(input.getSystemId(), encoding);
    }

    /**
     * Opens a reader on the resource named by a system identifier.
     * An identifier that is not an absolute URI is taken to be a file name;
     * absolute <code>http</code> and <code>https</code> URIs are fetched.
     *
     * @param systemId the file name or URI of the resource
     * @param encoding the charset name; null or empty means unspecified, in
     *        which case the charset announced by the server, or else the
     *        platform default, is used
     * @return a reader, or null if the URI scheme is not supported
     * @throws IOException if systemId is null or the resource cannot be opened
     */
    public Reader getReaderFromSystemId(String systemId, String encoding) throws IOException {
        if (systemId == null) {
            throw new IOException("Cannot create a reader from a null systemID");
        }
        if (encoding != null && encoding.isEmpty()) {
            encoding = null;
        }
        URI uri = null;
        try {
            uri = new URI(systemId);
        } catch (URISyntaxException e) {
            // not a URI at all: it is treated as a file name below
        }
        if (uri == null || !uri.isAbsolute()) {
            File file = new File(systemId);
            if (encoding == null) {
                return new FileReader(file);
            }
            return openReader(new FileInputStream(file), encoding);
        }
        String scheme = uri.getScheme();
        if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
            URL url = uri.toURL();
            URLConnection conn = url.openConnection();
            InputStream stream = conn.getInputStream();
            if (encoding == null) {
                // The charset is a parameter of Content-Type; Content-Encoding
                // names a transfer coding such as gzip and is of no use here.
                encoding = charsetOf(conn.getContentType());
            }
            return openReader(stream, encoding);
        }
        return null;
    }

    /**
     * @see org.xml.sax.XMLReader#getContentHandler()
     */
    @Override
    public ContentHandler getContentHandler() {
        return contentHandler;
    }

    /**
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    @Override
    public DTDHandler getDTDHandler() {
        return dtdHandler;
    }

    /**
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    @Override
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /**
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    @Override
    public ErrorHandler getErrorHandler() {
        return errorHandler;
    }

    /**
     * @return the lexicalHandler
     */
    public LexicalHandler getLexicalHandler() {
        return lexicalHandler;
    }

    /**
     * Reports the two features every SAX2 reader must recognize: namespace
     * processing is always on, reporting of xmlns attributes is always off.
     *
     * @throws SAXNotSupportedException for any other feature
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    @Override
    public boolean getFeature(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (NAMESPACES_FEATURE.equals(arg0)) {
            return true;
        }
        if (NAMESPACE_PREFIXES_FEATURE.equals(arg0)) {
            return false;
        }
        throw new SAXNotSupportedException("Feature not supported: " + arg0);
    }

    /**
     * Returns the lexical handler for the SAX lexical-handler property.
     *
     * @throws SAXNotSupportedException for any other property
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    @Override
    public Object getProperty(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (LEXICAL_HANDLER_PROPERTY.equals(arg0)) {
            return lexicalHandler;
        }
        throw new SAXNotSupportedException("Property not supported: " + arg0);
    }

    /**
     * Parses the pattern file provided by the input source. The reader used
     * for parsing, including a stream supplied by the application, is closed
     * when parsing ends.
     *
     * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
     */
    @Override
    public void parse(InputSource input) throws IOException, SAXException {
        Reader reader = getReaderFromInputSource(input);
        if (reader == null) {
            throw new IOException("Could not open input source " + input);
        }
        parseAndClose(reader);
    }

    /**
     * Parses the pattern file named by the system identifier.
     *
     * @see org.xml.sax.XMLReader#parse(java.lang.String)
     */
    @Override
    public void parse(String systemId) throws IOException, SAXException {
        Reader reader = getReaderFromSystemId(systemId, null);
        if (reader == null) {
            throw new IOException("Could not open input systemID " + systemId);
        }
        parseAndClose(reader);
    }

    /**
     * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
     */
    @Override
    public void setContentHandler(ContentHandler contenthandler) {
        this.contentHandler = contenthandler;
    }

    /**
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    @Override
    public void setDTDHandler(DTDHandler dtdhandler) {
        this.dtdHandler = dtdhandler;
    }

    /**
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    @Override
    public void setEntityResolver(EntityResolver entityresolver) {
        this.entityResolver = entityresolver;
    }

    /**
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    @Override
    public void setErrorHandler(ErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
    }

    /**
     * @param lexicalHandler the lexicalHandler to set
     */
    public void setLexicalHandler(LexicalHandler lexicalHandler) {
        this.lexicalHandler = lexicalHandler;
    }

    /**
     * Accepts only the feature values this parser already has: namespace
     * processing on, reporting of xmlns attributes off.
     *
     * @throws SAXNotSupportedException for any other feature or value
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    @Override
    public void setFeature(String arg0, boolean arg1)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        boolean alreadySo = (NAMESPACES_FEATURE.equals(arg0) && arg1)
            || (NAMESPACE_PREFIXES_FEATURE.equals(arg0) && !arg1);
        if (!alreadySo) {
            throw new SAXNotSupportedException("Feature not supported: " + arg0 + "=" + arg1);
        }
    }

    /**
     * Sets the lexical handler through the SAX lexical-handler property.
     * A null value removes the handler.
     *
     * @throws SAXNotSupportedException for any other property, or if the
     *         value is not a {@link LexicalHandler}
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
     */
    @Override
    public void setProperty(String name, Object value)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (!LEXICAL_HANDLER_PROPERTY.equals(name)) {
            throw new SAXNotSupportedException("Property not supported: " + name);
        }
        if (value != null && !(value instanceof LexicalHandler)) {
            throw new SAXNotSupportedException(
                    "Value of " + name + " must be a LexicalHandler, not " + value.getClass().getName());
        }
        lexicalHandler = (LexicalHandler) value;
    }

}
326