• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright Simon Pepping 2009
3  *
4  * The copyright owner licenses this file to You under the Apache License, Version 2.0
5  * (the "License"); you may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /* $Id$ */
18 
19 package org.tug.texhyphen;
20 
21 import java.io.BufferedReader;
22 import java.io.File;
23 import java.io.FileInputStream;
24 import java.io.FileReader;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.InputStreamReader;
28 import java.io.Reader;
29 import java.net.URI;
30 import java.net.URISyntaxException;
31 import java.net.URL;
32 import java.net.URLConnection;
33 import java.util.Collection;
34 import java.util.Iterator;
35 import java.util.List;
36 import java.util.Vector;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39 
40 import org.xml.sax.Attributes;
41 import org.xml.sax.ContentHandler;
42 import org.xml.sax.DTDHandler;
43 import org.xml.sax.EntityResolver;
44 import org.xml.sax.ErrorHandler;
45 import org.xml.sax.InputSource;
46 import org.xml.sax.SAXException;
47 import org.xml.sax.SAXNotRecognizedException;
48 import org.xml.sax.SAXNotSupportedException;
49 import org.xml.sax.XMLReader;
50 import org.xml.sax.ext.LexicalHandler;
51 import org.xml.sax.helpers.AttributesImpl;
52 
/**
 * An {@link XMLReader} that reads TeX hyphenation language data and reports it as SAX events.
 *
 * <p>The input is scanned line by line. A <code>{</code> opens a language record and a
 * <code>}</code> closes it; inside a record, lines of the form
 * <code>"key" =&gt; "value",</code>, <code>"key" =&gt; true|false|nil,</code> and
 * <code>"key" =&gt; [v1,v2,...],</code> are recognised, and <code>#</code> starts a comment
 * that runs to the end of the line. Anything else on a line is skipped silently.
 *
 * <p>The generated document has a <code>languages</code> root element in the namespace
 * {@link #LANG_NAMESPACE}, with one <code>language</code> child per record. Keys listed in
 * {@link #attributeKeys} become attributes of <code>language</code>; all other keys become
 * child elements. Comments are reported to the {@link LexicalHandler}, if one is set.
 *
 * <p>A {@link ContentHandler} must be set before calling one of the <code>parse</code> methods.
 */
public class LanguageDataParser implements XMLReader {

    /** Namespace URI of every element produced by this reader. */
    public static final String LANG_NAMESPACE = "urn:org:tug:texhyphen:languagedata";

    /** Not read anywhere in this class; kept because it is part of the public interface. */
    public static int lineLength = 72;

    /** SAX property name under which the lexical handler can be set and queried. */
    private static final String LEXICAL_HANDLER_PROPERTY =
        "http://xml.org/sax/properties/lexical-handler";

    /** Parser states: outside any record, and between <code>{</code> and <code>}</code>. */
    private static final int TOP_LEVEL = 3, IN_LANG = 4;

    private static final Pattern
    comment = Pattern.compile("#.*"),
    langStart = Pattern.compile("{", Pattern.LITERAL),
    langEnd = Pattern.compile("}", Pattern.LITERAL),
    dataline = Pattern.compile("\"([^\"]+)\" ?=> ?\"([^\"]+)\","),
    keywordline = Pattern.compile("\"([^\"]+)\" ?=> ?(false|true|nil),"),
    listline = Pattern.compile("\"([^\"]+)\" ?=> ?\\[(\"[^\"]+\"(?:,\"[^\"]+\")*)\\],"),
    datalistline = Pattern.compile("\"([^\"]+)\" ?=> ?\\[([^,]+(?:,[^,]+)*)\\],"),
    space = Pattern.compile("[ \\t]+");

    /** Shared attribute list for elements without attributes; never modified. */
    private static final AttributesImpl emptyAtts = new AttributesImpl();

    private ContentHandler contentHandler;
    private DTDHandler dtdHandler;
    private EntityResolver entityResolver;
    private ErrorHandler errorHandler;
    private LexicalHandler lexicalHandler;

    /**
     * Scans the input line by line and emits the complete SAX event stream.
     *
     * @param inbr the input; it is read to the end but not closed here
     * @throws SAXException if a handler rejects an event
     * @throws IOException if reading fails
     */
    private void parseLanguageData(BufferedReader inbr) throws SAXException, IOException {
        int parseState = TOP_LEVEL;
        Language lang = null;

        contentHandler.startDocument();
        contentHandler.startPrefixMapping("", LANG_NAMESPACE);
        contentHandler.startElement(LANG_NAMESPACE, "languages", "languages", emptyAtts);

        for (String line = inbr.readLine(); line != null; line = inbr.readLine()) {
            Matcher matcher = comment.matcher(line).useAnchoringBounds(true);
            int start = 0;
            // Consume the line token by token; each branch tries a pattern at the current
            // region start, so the order of the branches is significant.
            while (start < line.length()) {
                if (matcher.usePattern(comment).lookingAt()) {
                    processComment(matcher.group(), parseState == TOP_LEVEL ? null : lang);
                } else if (matcher.usePattern(space).lookingAt()) {
                    // whitespace between tokens carries no information
                } else if (parseState == TOP_LEVEL && matcher.usePattern(langStart).lookingAt()) {
                    parseState = IN_LANG;
                    lang = new Language();
                } else if ((parseState == IN_LANG) && matcher.usePattern(langEnd).lookingAt()) {
                    pushoutLanguage(lang);
                    lang = null;
                    parseState = TOP_LEVEL;
                } else if (parseState == IN_LANG
                        && (matcher.usePattern(dataline).lookingAt()
                                || matcher.usePattern(keywordline).lookingAt())) {
                    String key = matcher.group(1);
                    String value = matcher.group(2);
                    processDataline(key, value, lang);
                } else if (parseState == IN_LANG
                        && (matcher.usePattern(listline).lookingAt()
                                || matcher.usePattern(datalistline).lookingAt())) {
                    String key = matcher.group(1);
                    String values = matcher.group(2);
                    processListline(key, values, lang);
                } else {
                    // Unrecognised content: ignore the rest of this line.
                    break;
                }
                start = matcher.end();
                matcher = matcher.region(start, line.length()).useAnchoringBounds(true);
            }
        }

        contentHandler.endElement(LANG_NAMESPACE, "languages", "languages");
        // endPrefixMapping takes the prefix that was passed to startPrefixMapping, not the URI.
        contentHandler.endPrefixMapping("");
        contentHandler.endDocument();
    }

    /** Keys (after replacing '_' by '-') that are emitted as attributes of <code>language</code>. */
    static Collection<String> attributeKeys;
    static {
        attributeKeys = new Vector<String>();
        attributeKeys.add("code");
        attributeKeys.add("name");
        attributeKeys.add("use-old-patterns");
        attributeKeys.add("use-new-loader");
        attributeKeys.add("encoding");
        attributeKeys.add("exceptions");
    }

    /**
     * Handles one comment token.
     *
     * @param comment the comment text including its leading <code>#</code>
     * @param lang the record being built, or <code>null</code> at top level; top-level comments
     *        are reported to the lexical handler at once, the others are queued in the record
     * @throws SAXException if the lexical handler rejects the comment
     */
    private void processComment(String comment, Language lang) throws SAXException {
        // "--" is replaced by two en dashes, presumably because it is not allowed in XML comments.
        comment = comment.replace("--", "––");
        if (!comment.endsWith(" ")) {
            comment = comment + " ";
        }
        if (lang == null) {
            char[] textchars = comment.toCharArray();
            if (lexicalHandler != null) {
                // offset 1 skips the leading '#'
                lexicalHandler.comment(textchars, 1, textchars.length - 1);
            }
        } else {
            lang.elements.add(new Element("comment", comment));
        }
    }

    /**
     * Stores a single-valued entry in the record, as an attribute if the key is in
     * {@link #attributeKeys} and as a child element otherwise.
     *
     * @param key the key as written in the input; '_' is replaced by '-'
     * @param value the value; <code>nil</code> is stored as the empty string
     * @param lang the record being built
     */
    private void processDataline(String key, String value, Language lang) {
        key = key.replace('_', '-');
        if (value.equals("nil")) {
            value = "";
        }
        if (attributeKeys.contains(key)) {
            lang.atts.addAttribute("", key, key, "CDATA", value);
        } else {
            lang.elements.add(new Element(key, value));
        }
    }

    /**
     * Stores a list-valued entry in the record.
     *
     * <ul>
     * <li>Attribute keys: the non-<code>nil</code> values are joined into one attribute value,
     *     each value preceded by a space (so the value starts with a space).</li>
     * <li><code>hyphenmin</code>: becomes an empty <code>hyphen-min</code> element whose
     *     <code>before</code> and <code>after</code> attributes hold the first two values;
     *     a missing value is stored as the empty string.</li>
     * <li>Any other key: a trailing "s" is removed from the key and one element is added per
     *     value, with <code>nil</code> stored as the empty string.</li>
     * </ul>
     *
     * @param key the key as written in the input; '_' is replaced by '-'
     * @param valuesString the text between the square brackets
     * @param lang the record being built
     */
    private void processListline(String key, String valuesString, Language lang) {
        key = key.replace('_', '-');
        valuesString = valuesString.replace("\"", "");
        String[] values = valuesString.split(",[ \\t]*");
        if (attributeKeys.contains(key)) {
            StringBuilder attValue = new StringBuilder();
            for (String value : values) {
                if (!value.equals("nil")) {
                    attValue.append(' ').append(value);
                }
            }
            lang.atts.addAttribute("", key, key, "CDATA", attValue.toString());
        } else if (key.equals("hyphenmin")) {
            // Guard against lists with fewer than two entries instead of failing on the index.
            String before = values.length > 0 ? values[0] : "";
            String after = values.length > 1 ? values[1] : "";
            AttributesImpl atts = new AttributesImpl();
            atts.addAttribute("", "before", "before", "CDATA", before);
            atts.addAttribute("", "after", "after", "CDATA", after);
            lang.elements.add(new Element("hyphen-min", "", atts));
        } else {
            key = key.replaceAll("s$", "");
            for (String value : values) {
                if (value.equals("nil")) {
                    value = "";
                }
                lang.elements.add(new Element(key, value));
            }
        }
    }

    /**
     * Emits the SAX events for one completed record: the <code>language</code> element with its
     * attributes, followed by the queued comments and child elements in input order.
     *
     * @param lang the completed record
     * @throws SAXException if a handler rejects an event
     */
    private void pushoutLanguage(Language lang) throws SAXException {
        contentHandler.startElement(LANG_NAMESPACE, "language", "language", lang.atts);
        for (Element elt : lang.elements) {
            char[] text = elt.content.toCharArray();
            if (elt.tag.equals("comment")) {
                if (lexicalHandler != null) {
                    // offset 1 skips the leading '#'
                    lexicalHandler.comment(text, 1, text.length - 1);
                }
            } else {
                contentHandler.startElement(LANG_NAMESPACE, elt.tag, elt.tag, elt.atts);
                contentHandler.characters(text, 0, text.length);
                contentHandler.endElement(LANG_NAMESPACE, elt.tag, elt.tag);
            }
        }
        contentHandler.endElement(LANG_NAMESPACE, "language", "language");
    }

    /**
     * Obtains a reader for an input source, trying in this order: its character stream, its
     * byte stream (decoded with the source's encoding, or the platform default if none is
     * given), and finally its system id.
     *
     * @param input the input source
     * @return the reader, or <code>null</code> if the system id uses an unsupported URI scheme
     * @throws IOException if the source has neither a stream nor a system id, or if opening fails
     */
    public Reader getReaderFromInputSource(InputSource input) throws IOException {
        Reader reader = input.getCharacterStream();
        if (reader != null) {
            return reader;
        }
        String encoding = input.getEncoding();
        InputStream stream = input.getByteStream();
        if (stream != null) {
            return newReader(stream, encoding);
        }
        return getReaderFromSystemId(input.getSystemId(), encoding);
    }

    /**
     * Opens a reader for a system id. A system id that is not an absolute URI is used as a
     * file name; <code>file:</code>, <code>http:</code> and <code>https:</code> URIs are opened
     * accordingly.
     *
     * @param systemId a file name or URI; must not be <code>null</code>
     * @param encoding the character encoding; <code>null</code> or empty means the charset
     *        announced in the HTTP Content-Type header if there is one, and the platform
     *        default otherwise
     * @return the reader, or <code>null</code> if the URI scheme is not supported
     * @throws IOException if <code>systemId</code> is <code>null</code>, cannot be mapped to a
     *         file, or cannot be opened, or if the encoding is not supported
     */
    public Reader getReaderFromSystemId(String systemId, String encoding) throws IOException {
        if (systemId == null) {
            throw new IOException("Cannot create a reader from a null systemID");
        }
        if (encoding != null && encoding.isEmpty()) {
            encoding = null;
        }
        URI uri = null;
        try {
            uri = new URI(systemId);
        } catch (URISyntaxException e) {
            // Not a well-formed URI (for example a path with spaces): use it as a file name.
        }
        if (uri == null || !uri.isAbsolute()) {
            return newReader(new FileInputStream(new File(systemId)), encoding);
        }
        String scheme = uri.getScheme();
        if ("file".equalsIgnoreCase(scheme)) {
            File file;
            try {
                file = new File(uri);
            } catch (IllegalArgumentException e) {
                throw new IOException("Cannot map systemID to a local file: " + systemId, e);
            }
            return newReader(new FileInputStream(file), encoding);
        }
        if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
            URLConnection conn = uri.toURL().openConnection();
            if (encoding == null) {
                // The charset is a parameter of Content-Type; Content-Encoding names a
                // transfer coding such as gzip and must not be used as a charset name.
                encoding = charsetFromContentType(conn.getContentType());
            }
            return newReader(conn.getInputStream(), encoding);
        }
        return null;
    }

    /**
     * Wraps a byte stream in a reader. The stream is closed if the encoding is not supported,
     * so that a failed open does not leak it.
     *
     * @param stream the byte stream
     * @param encoding the charset name; <code>null</code> or empty selects the platform default
     */
    private static Reader newReader(InputStream stream, String encoding) throws IOException {
        if (encoding == null || encoding.isEmpty()) {
            return new InputStreamReader(stream);
        }
        try {
            return new InputStreamReader(stream, encoding);
        } catch (IOException e) {
            stream.close();
            throw e;
        }
    }

    /**
     * Extracts the <code>charset</code> parameter from a Content-Type header value.
     *
     * @param contentType the header value, may be <code>null</code>
     * @return the charset name, or <code>null</code> if none is given
     */
    private static String charsetFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        String marker = "charset=";
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (param.regionMatches(true, 0, marker, 0, marker.length())) {
                String charset = param.substring(marker.length()).trim();
                if (charset.length() >= 2 && charset.startsWith("\"") && charset.endsWith("\"")) {
                    charset = charset.substring(1, charset.length() - 1);
                }
                return charset.isEmpty() ? null : charset;
            }
        }
        return null;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getContentHandler()
     */
    @Override
    public ContentHandler getContentHandler() {
        return contentHandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    @Override
    public DTDHandler getDTDHandler() {
        return dtdHandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    @Override
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    @Override
    public ErrorHandler getErrorHandler() {
        return errorHandler;
    }

    /**
     * @return the lexicalHandler
     */
    public LexicalHandler getLexicalHandler() {
        return lexicalHandler;
    }

    /**
     * No features can be queried on this reader.
     *
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    @Override
    public boolean getFeature(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        throw new SAXNotSupportedException();
    }

    /**
     * Returns the lexical handler for the SAX lexical-handler property; every other property
     * is rejected.
     *
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    @Override
    public Object getProperty(String arg0)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (LEXICAL_HANDLER_PROPERTY.equals(arg0)) {
            return lexicalHandler;
        }
        throw new SAXNotSupportedException();
    }

    /**
     * Parses the input source and closes its reader afterwards, whether or not parsing
     * succeeded.
     *
     * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
     */
    @Override
    public void parse(InputSource input) throws IOException, SAXException {
        Reader reader = getReaderFromInputSource(input);
        if (reader == null) {
            throw new IOException("Could not open input source " + input);
        }
        parseAndClose(reader);
    }

    /**
     * Parses the resource named by the system id and closes it afterwards.
     *
     * @see org.xml.sax.XMLReader#parse(java.lang.String)
     */
    @Override
    public void parse(String systemId) throws IOException, SAXException {
        Reader reader = getReaderFromSystemId(systemId, null);
        if (reader == null) {
            throw new IOException("Could not open input systemID " + systemId);
        }
        parseAndClose(reader);
    }

    /** Runs the parser over the reader and closes the reader even if parsing fails. */
    private void parseAndClose(Reader reader) throws IOException, SAXException {
        BufferedReader inbr = new BufferedReader(reader);
        try {
            parseLanguageData(inbr);
        } finally {
            inbr.close();
        }
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
     */
    @Override
    public void setContentHandler(ContentHandler contenthandler) {
        this.contentHandler = contenthandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    @Override
    public void setDTDHandler(DTDHandler dtdhandler) {
        this.dtdHandler = dtdhandler;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    @Override
    public void setEntityResolver(EntityResolver entityresolver) {
        this.entityResolver = entityresolver;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    @Override
    public void setErrorHandler(ErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
    }

    /**
     * @param lexicalHandler the lexicalHandler to set
     */
    public void setLexicalHandler(LexicalHandler lexicalHandler) {
        this.lexicalHandler = lexicalHandler;
    }

    /**
     * No features can be set on this reader.
     *
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    @Override
    public void setFeature(String arg0, boolean arg1)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        throw new SAXNotSupportedException();
    }

    /**
     * Sets the lexical handler through the SAX lexical-handler property; every other property
     * is rejected.
     *
     * @throws SAXNotSupportedException if the property is not the lexical-handler property, or
     *         if the value is neither <code>null</code> nor a {@link LexicalHandler}
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
     */
    @Override
    public void setProperty(String name, Object value)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (!LEXICAL_HANDLER_PROPERTY.equals(name)) {
            throw new SAXNotSupportedException();
        }
        if (value != null && !(value instanceof LexicalHandler)) {
            throw new SAXNotSupportedException(
                    "The value of " + name + " must be a LexicalHandler");
        }
        lexicalHandler = (LexicalHandler) value;
    }

    /** A queued child of a language record: an element, or a comment (tag "comment"). */
    private static class Element {
        final String tag;
        final String content;
        final Attributes atts;

        Element(String tag, String content) {
            this(tag, content, LanguageDataParser.emptyAtts);
        }

        Element(String tag, String content, Attributes atts) {
            this.tag = tag;
            this.content = content;
            this.atts = atts;
        }
    }

    /** A language record under construction: its attributes and its children in input order. */
    private static class Language {
        final AttributesImpl atts;
        final List<Element> elements;

        Language() {
            atts = new AttributesImpl();
            elements = new Vector<Element>();
        }
    }

}
429