• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.dicttool;
18 
19 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
20 import com.android.inputmethod.latin.makedict.FusionDictionary;
21 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
22 import com.android.inputmethod.latin.makedict.ProbabilityInfo;
23 import com.android.inputmethod.latin.makedict.WeightedString;
24 import com.android.inputmethod.latin.makedict.WordProperty;
25 
26 import org.xml.sax.Attributes;
27 import org.xml.sax.SAXException;
28 import org.xml.sax.helpers.DefaultHandler;
29 
30 import java.io.BufferedInputStream;
31 import java.io.BufferedReader;
32 import java.io.BufferedWriter;
33 import java.io.FileInputStream;
34 import java.io.IOException;
35 import java.io.InputStreamReader;
36 import java.util.ArrayList;
37 import java.util.HashMap;
38 import java.util.TreeSet;
39 
40 import javax.xml.parsers.ParserConfigurationException;
41 import javax.xml.parsers.SAXParser;
42 import javax.xml.parsers.SAXParserFactory;
43 
44 /**
45  * Reads and writes XML files for a FusionDictionary.
46  *
47  * All functions in this class are static.
48  */
49 public class XmlDictInputOutput {
50 
51     private static final String ROOT_TAG = "wordlist";
52     private static final String WORD_TAG = "w";
53     private static final String BIGRAM_TAG = "bigram";
54     private static final String SHORTCUT_TAG = "shortcut";
55     private static final String PROBABILITY_ATTR = "f";
56     private static final String WORD_ATTR = "word";
57     private static final String NOT_A_WORD_ATTR = "not_a_word";
58 
59     /**
60      * SAX handler for a unigram XML file.
61      */
62     static private class UnigramHandler extends DefaultHandler {
63         // Parser states
64         private static final int START = 1;
65         private static final int WORD = 2;
66         private static final int UNKNOWN = 3;
67         private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;
68 
69         FusionDictionary mDictionary;
70         int mState; // the state of the parser
71         int mFreq; // the currently read freq
72         String mWord; // the current word
73         final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
74 
75         /**
76          * Create the handler.
77          *
78          * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
79          */
UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts)80         public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
81             mDictionary = null;
82             mShortcutsMap = shortcuts;
83             mWord = "";
84             mState = START;
85             mFreq = 0;
86         }
87 
getFinalDictionary()88         public FusionDictionary getFinalDictionary() {
89             final FusionDictionary dict = mDictionary;
90             for (final String shortcutOnly : mShortcutsMap.keySet()) {
91                 if (dict.hasWord(shortcutOnly)) continue;
92                 dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
93                         mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
94             }
95             mDictionary = null;
96             mShortcutsMap.clear();
97             mWord = "";
98             mState = START;
99             mFreq = 0;
100             return dict;
101         }
102 
103         @Override
startElement(String uri, String localName, String qName, Attributes attrs)104         public void startElement(String uri, String localName, String qName, Attributes attrs) {
105             if (WORD_TAG.equals(localName)) {
106                 mState = WORD;
107                 mWord = "";
108                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
109                     final String attrName = attrs.getLocalName(attrIndex);
110                     if (PROBABILITY_ATTR.equals(attrName)) {
111                         mFreq = Integer.parseInt(attrs.getValue(attrIndex));
112                     }
113                 }
114             } else if (ROOT_TAG.equals(localName)) {
115                 final HashMap<String, String> attributes = new HashMap<>();
116                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
117                     final String attrName = attrs.getLocalName(attrIndex);
118                     attributes.put(attrName, attrs.getValue(attrIndex));
119                 }
120                 mDictionary = new FusionDictionary(new PtNodeArray(),
121                         new DictionaryOptions(attributes));
122             } else {
123                 mState = UNKNOWN;
124             }
125         }
126 
127         @Override
characters(char[] ch, int start, int length)128         public void characters(char[] ch, int start, int length) {
129             if (WORD == mState) {
130                 // The XML parser is free to return text in arbitrary chunks one after the
131                 // other. In particular, this happens in some implementations when it finds
132                 // an escape code like "&amp;".
133                 mWord += String.copyValueOf(ch, start, length);
134             }
135         }
136 
137         @Override
endElement(String uri, String localName, String qName)138         public void endElement(String uri, String localName, String qName) {
139             if (WORD == mState) {
140                 mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
141                         false /* isNotAWord */);
142                 mState = START;
143             }
144         }
145     }
146 
147     static private class AssociativeListHandler extends DefaultHandler {
148         private final String SRC_TAG;
149         private final String SRC_ATTRIBUTE;
150         private final String DST_TAG;
151         private final String DST_ATTRIBUTE;
152         private final String DST_FREQ;
153 
154         // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
155         private final static int XML_MAX = 256;
156         // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
157         private final static int MEMORY_MAX = 256;
158         private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;
159 
160         private String mSrc;
161         private final HashMap<String, ArrayList<WeightedString>> mAssocMap;
162 
AssociativeListHandler(final String srcTag, final String srcAttribute, final String dstTag, final String dstAttribute, final String dstFreq)163         public AssociativeListHandler(final String srcTag, final String srcAttribute,
164                 final String dstTag, final String dstAttribute, final String dstFreq) {
165             SRC_TAG = srcTag;
166             SRC_ATTRIBUTE = srcAttribute;
167             DST_TAG = dstTag;
168             DST_ATTRIBUTE = dstAttribute;
169             DST_FREQ = dstFreq;
170             mSrc = null;
171             mAssocMap = new HashMap<>();
172         }
173 
174         @Override
startElement(String uri, String localName, String qName, Attributes attrs)175         public void startElement(String uri, String localName, String qName, Attributes attrs) {
176             if (SRC_TAG.equals(localName)) {
177                 mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
178             } else if (DST_TAG.equals(localName)) {
179                 String dst = attrs.getValue(uri, DST_ATTRIBUTE);
180                 int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
181                 WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
182                 ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
183                 if (null == bigramList) bigramList = new ArrayList<>();
184                 bigramList.add(bigram);
185                 mAssocMap.put(mSrc, bigramList);
186             }
187         }
188 
getValueFromFreqString(final String freqString)189         protected int getValueFromFreqString(final String freqString) {
190             return Integer.parseInt(freqString);
191         }
192 
193         // This may return an empty map, but will never return null.
getAssocMap()194         public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
195             return mAssocMap;
196         }
197     }
198 
199     /**
200      * SAX handler for a bigram XML file.
201      */
202     static private class BigramHandler extends AssociativeListHandler {
203         private final static String BIGRAM_W1_TAG = "bi";
204         private final static String BIGRAM_W2_TAG = "w";
205         private final static String BIGRAM_W1_ATTRIBUTE = "w1";
206         private final static String BIGRAM_W2_ATTRIBUTE = "w2";
207         private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
208 
BigramHandler()209         public BigramHandler() {
210             super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
211                     BIGRAM_FREQ_ATTRIBUTE);
212         }
213 
214         // As per getAssocMap(), this never returns null.
getBigramMap()215         public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
216             return getAssocMap();
217         }
218     }
219 
220     /**
221      * SAX handler for a shortcut & whitelist XML file.
222      */
223     static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
224         private final static String ENTRY_TAG = "entry";
225         private final static String ENTRY_ATTRIBUTE = "shortcut";
226         private final static String TARGET_TAG = "target";
227         private final static String REPLACEMENT_ATTRIBUTE = "replacement";
228         private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
229         private final static String WHITELIST_MARKER = "whitelist";
230         private final static int WHITELIST_FREQ_VALUE = 15;
231         private final static int MIN_FREQ = 0;
232         private final static int MAX_FREQ = 14;
233 
ShortcutAndWhitelistHandler()234         public ShortcutAndWhitelistHandler() {
235             super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
236                     TARGET_PRIORITY_ATTRIBUTE);
237         }
238 
239         @Override
getValueFromFreqString(final String freqString)240         protected int getValueFromFreqString(final String freqString) {
241             if (WHITELIST_MARKER.equals(freqString)) {
242                 return WHITELIST_FREQ_VALUE;
243             }
244             final int intValue = super.getValueFromFreqString(freqString);
245             if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
246                 throw new RuntimeException("Shortcut freq out of range. Accepted range is "
247                         + MIN_FREQ + ".." + MAX_FREQ);
248             }
249             return intValue;
250         }
251 
252         // As per getAssocMap(), this never returns null.
getShortcutAndWhitelistMap()253         public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
254             return getAssocMap();
255         }
256     }
257 
258     /**
259      * Basic test to find out whether the file is in the unigram XML format or not.
260      *
261      * Concretely this only tests the header line.
262      *
263      * @param filename The name of the file to test.
264      * @return true if the file is in the unigram XML format, false otherwise
265      */
isXmlUnigramDictionary(final String filename)266     public static boolean isXmlUnigramDictionary(final String filename) {
267         try (final BufferedReader reader = new BufferedReader(
268                 new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
269             final String firstLine = reader.readLine();
270             return firstLine.matches("^\\s*<wordlist .*>\\s*$");
271         } catch (final IOException e) {
272             return false;
273         }
274     }
275 
276     /**
277      * Reads a dictionary from an XML file.
278      *
279      * This is the public method that will parse an XML file and return the corresponding memory
280      * representation.
281      *
282      * @param unigrams the file to read the data from.
283      * @param shortcuts the file to read the shortcuts & whitelist from, or null.
284      * @param bigrams the file to read the bigrams from, or null.
285      * @return the in-memory representation of the dictionary.
286      */
readDictionaryXml(final BufferedInputStream unigrams, final BufferedInputStream shortcuts, final BufferedInputStream bigrams)287     public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams,
288             final BufferedInputStream shortcuts, final BufferedInputStream bigrams)
289             throws SAXException, IOException, ParserConfigurationException {
290         final SAXParserFactory factory = SAXParserFactory.newInstance();
291         factory.setNamespaceAware(true);
292         final SAXParser parser = factory.newSAXParser();
293         final BigramHandler bigramHandler = new BigramHandler();
294         if (null != bigrams) parser.parse(bigrams, bigramHandler);
295 
296         final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
297                 new ShortcutAndWhitelistHandler();
298         if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
299 
300         final UnigramHandler unigramHandler =
301                 new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
302         parser.parse(unigrams, unigramHandler);
303         final FusionDictionary dict = unigramHandler.getFinalDictionary();
304         final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
305         for (final String firstWord : bigramMap.keySet()) {
306             if (!dict.hasWord(firstWord)) continue;
307             final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
308             for (final WeightedString bigram : bigramList) {
309                 if (!dict.hasWord(bigram.mWord)) continue;
310                 dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
311             }
312         }
313         return dict;
314     }
315 
    // NOTE: the description below documents a reader for the first, legacy XML format, but no
    // method is defined after it. It is kept here as a plain comment for reference.
    //
    // Reads a dictionary in the first, legacy XML format.
    //
    // This method reads data from the parser and creates a new FusionDictionary with it.
    // The format parsed by this method is the format used before Ice Cream Sandwich,
    // which has no support for bigrams or shortcuts/whitelist.
    // It is important to note that this method expects the parser to have already eaten
    // the first, all-encompassing tag.
    //
    // Parameter xpp: the parser to read the data from.
    // Returns: the parsed dictionary.
328 
329     /**
330      * Writes a dictionary to an XML file.
331      *
332      * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
333      *
334      * @param destination a destination stream to write to.
335      * @param dict the dictionary to write.
336      */
writeDictionaryXml(final BufferedWriter destination, final FusionDictionary dict)337     public static void writeDictionaryXml(final BufferedWriter destination,
338             final FusionDictionary dict) throws IOException {
339         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
340         for (WordProperty wordProperty : dict) {
341             wordPropertiesInDict.add(wordProperty);
342         }
343         // TODO: use an XMLSerializer if this gets big
344         destination.write("<wordlist format=\"2\"");
345         for (final String key : dict.mOptions.mAttributes.keySet()) {
346             final String value = dict.mOptions.mAttributes.get(key);
347             destination.write(" " + key + "=\"" + value + "\"");
348         }
349         destination.write(">\n");
350         destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
351         for (WordProperty wordProperty : wordPropertiesInDict) {
352             destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord
353                     + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability()
354                     + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "")
355                     + "\">");
356             if (null != wordProperty.mShortcutTargets) {
357                 destination.write("\n");
358                 for (WeightedString target : wordProperty.mShortcutTargets) {
359                     destination.write("    <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\""
360                             + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG
361                             + ">\n");
362                 }
363                 destination.write("  ");
364             }
365             if (null != wordProperty.mBigrams) {
366                 destination.write("\n");
367                 for (WeightedString bigram : wordProperty.mBigrams) {
368                     destination.write("    <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\""
369                             + bigram.getProbability() + "\">" + bigram.mWord
370                             + "</" + BIGRAM_TAG + ">\n");
371                 }
372                 destination.write("  ");
373             }
374             destination.write("</" + WORD_TAG + ">\n");
375         }
376         destination.write("</wordlist>\n");
377         destination.close();
378     }
379 }
380