• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin;
18 
19 import com.android.inputmethod.latin.FusionDictionary.WeightedString;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Writer;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.TreeSet;
27 
28 import javax.xml.parsers.ParserConfigurationException;
29 import javax.xml.parsers.SAXParser;
30 import javax.xml.parsers.SAXParserFactory;
31 
32 import org.xml.sax.Attributes;
33 import org.xml.sax.SAXException;
34 import org.xml.sax.helpers.DefaultHandler;
35 
36 /**
37  * Reads and writes XML files for a FusionDictionary.
38  *
39  * All functions in this class are static.
40  */
41 public class XmlDictInputOutput {
42 
    // Element name for a word entry; matched when reading unigrams and emitted when writing.
    private static final String WORD_TAG = "w";
    // Element name for a bigram nested inside a word entry; only used when writing.
    private static final String BIGRAM_TAG = "bigram";
    // Attribute holding a frequency; read from word entries, written on words and bigrams.
    private static final String FREQUENCY_ATTR = "f";
    // Attribute holding the word text on a word entry; only used when writing.
    private static final String WORD_ATTR = "word";
47 
48     /**
49      * SAX handler for a unigram XML file.
50      */
51     static private class UnigramHandler extends DefaultHandler {
52         // Parser states
53         private static final int NONE = 0;
54         private static final int START = 1;
55         private static final int WORD = 2;
56         private static final int BIGRAM = 4;
57         private static final int END = 5;
58         private static final int UNKNOWN = 6;
59 
60         final FusionDictionary mDictionary;
61         int mState; // the state of the parser
62         int mFreq; // the currently read freq
63         String mWord; // the current word
64         final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
65 
66         /**
67          * Create the handler.
68          *
69          * @param dict the dictionary to construct.
70          * @param bigrams the bigrams as a map. This may be empty, but may not be null.
71          */
UnigramHandler(FusionDictionary dict, HashMap<String, ArrayList<WeightedString>> bigrams)72         public UnigramHandler(FusionDictionary dict,
73                 HashMap<String, ArrayList<WeightedString>> bigrams) {
74             mDictionary = dict;
75             mBigramsMap = bigrams;
76             mWord = "";
77             mState = START;
78             mFreq = 0;
79         }
80 
81         @Override
startElement(String uri, String localName, String qName, Attributes attrs)82         public void startElement(String uri, String localName, String qName, Attributes attrs) {
83             if (WORD_TAG.equals(localName)) {
84                 mState = WORD;
85                 mWord = "";
86                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
87                     final String attrName = attrs.getLocalName(attrIndex);
88                     if (FREQUENCY_ATTR.equals(attrName)) {
89                         mFreq = Integer.parseInt(attrs.getValue(attrIndex));
90                     }
91                 }
92             } else {
93                 mState = UNKNOWN;
94             }
95         }
96 
97         @Override
characters(char[] ch, int start, int length)98         public void characters(char[] ch, int start, int length) {
99             if (WORD == mState) {
100                 // The XML parser is free to return text in arbitrary chunks one after the
101                 // other. In particular, this happens in some implementations when it finds
102                 // an escape code like "&amp;".
103                 mWord += String.copyValueOf(ch, start, length);
104             }
105         }
106 
107         @Override
endElement(String uri, String localName, String qName)108         public void endElement(String uri, String localName, String qName) {
109             if (WORD == mState) {
110                 mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord));
111                 mState = START;
112             }
113         }
114     }
115 
116     /**
117      * SAX handler for a bigram XML file.
118      */
119     static private class BigramHandler extends DefaultHandler {
120         private final static String BIGRAM_W1_TAG = "bi";
121         private final static String BIGRAM_W2_TAG = "w";
122         private final static String BIGRAM_W1_ATTRIBUTE = "w1";
123         private final static String BIGRAM_W2_ATTRIBUTE = "w2";
124         private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
125 
126         String mW1;
127         final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
128 
BigramHandler()129         public BigramHandler() {
130             mW1 = null;
131             mBigramsMap = new HashMap<String, ArrayList<WeightedString>>();
132         }
133 
134         @Override
startElement(String uri, String localName, String qName, Attributes attrs)135         public void startElement(String uri, String localName, String qName, Attributes attrs) {
136             if (BIGRAM_W1_TAG.equals(localName)) {
137                 mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE);
138             } else if (BIGRAM_W2_TAG.equals(localName)) {
139                 String w2 = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE);
140                 int freq = Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE));
141                 WeightedString bigram = new WeightedString(w2, freq / 8);
142                 ArrayList<WeightedString> bigramList = mBigramsMap.get(mW1);
143                 if (null == bigramList) bigramList = new ArrayList<WeightedString>();
144                 bigramList.add(bigram);
145                 mBigramsMap.put(mW1, bigramList);
146             }
147         }
148 
getBigramMap()149         public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
150             return mBigramsMap;
151         }
152     }
153 
154     /**
155      * Reads a dictionary from an XML file.
156      *
157      * This is the public method that will parse an XML file and return the corresponding memory
158      * representation.
159      *
160      * @param unigrams the file to read the data from.
161      * @return the in-memory representation of the dictionary.
162      */
readDictionaryXml(InputStream unigrams, InputStream bigrams)163     public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
164             throws SAXException, IOException, ParserConfigurationException {
165         final SAXParserFactory factory = SAXParserFactory.newInstance();
166         factory.setNamespaceAware(true);
167         final SAXParser parser = factory.newSAXParser();
168         final BigramHandler bigramHandler = new BigramHandler();
169         if (null != bigrams) parser.parse(bigrams, bigramHandler);
170 
171         final FusionDictionary dict = new FusionDictionary();
172         final UnigramHandler unigramHandler =
173                 new UnigramHandler(dict, bigramHandler.getBigramMap());
174         parser.parse(unigrams, unigramHandler);
175         return dict;
176     }
177 
178     /**
179      * Reads a dictionary in the first, legacy XML format
180      *
181      * This method reads data from the parser and creates a new FusionDictionary with it.
182      * The format parsed by this method is the format used before Ice Cream Sandwich,
183      * which has no support for bigrams or shortcuts.
184      * It is important to note that this method expects the parser to have already eaten
185      * the first, all-encompassing tag.
186      *
187      * @param xpp the parser to read the data from.
188      * @return the parsed dictionary.
189      */
190 
191     /**
192      * Writes a dictionary to an XML file.
193      *
194      * The output format is the "second" format, which supports bigrams and shortcuts.
195      *
196      * @param destination a destination stream to write to.
197      * @param dict the dictionary to write.
198      */
writeDictionaryXml(Writer destination, FusionDictionary dict)199     public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
200             throws IOException {
201         final TreeSet<Word> set = new TreeSet<Word>();
202         for (Word word : dict) {
203             set.add(word);
204         }
205         // TODO: use an XMLSerializer if this gets big
206         destination.write("<wordlist format=\"2\">\n");
207         for (Word word : set) {
208             destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
209                     + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
210             if (null != word.mBigrams) {
211                 destination.write("\n");
212                 for (WeightedString bigram : word.mBigrams) {
213                     destination.write("    <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
214                             + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n");
215                 }
216                 destination.write("  ");
217             }
218             destination.write("</" + WORD_TAG + ">\n");
219         }
220         destination.write("</wordlist>\n");
221         destination.close();
222     }
223 }
224