1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin.dicttool; 18 19 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; 20 import com.android.inputmethod.latin.makedict.FusionDictionary; 21 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; 22 import com.android.inputmethod.latin.makedict.ProbabilityInfo; 23 import com.android.inputmethod.latin.makedict.WeightedString; 24 import com.android.inputmethod.latin.makedict.WordProperty; 25 26 import org.xml.sax.Attributes; 27 import org.xml.sax.SAXException; 28 import org.xml.sax.helpers.DefaultHandler; 29 30 import java.io.BufferedInputStream; 31 import java.io.BufferedReader; 32 import java.io.BufferedWriter; 33 import java.io.FileInputStream; 34 import java.io.IOException; 35 import java.io.InputStreamReader; 36 import java.util.ArrayList; 37 import java.util.HashMap; 38 import java.util.TreeSet; 39 40 import javax.xml.parsers.ParserConfigurationException; 41 import javax.xml.parsers.SAXParser; 42 import javax.xml.parsers.SAXParserFactory; 43 44 /** 45 * Reads and writes XML files for a FusionDictionary. 46 * 47 * All functions in this class are static. 48 */ 49 public class XmlDictInputOutput { 50 51 private static final String ROOT_TAG = "wordlist"; 52 private static final String WORD_TAG = "w"; 53 private static final String BIGRAM_TAG = "bigram"; 54 private static final String SHORTCUT_TAG = "shortcut"; 55 private static final String PROBABILITY_ATTR = "f"; 56 private static final String WORD_ATTR = "word"; 57 private static final String NOT_A_WORD_ATTR = "not_a_word"; 58 59 /** 60 * SAX handler for a unigram XML file. 61 */ 62 static private class UnigramHandler extends DefaultHandler { 63 // Parser states 64 private static final int START = 1; 65 private static final int WORD = 2; 66 private static final int UNKNOWN = 3; 67 private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1; 68 69 FusionDictionary mDictionary; 70 int mState; // the state of the parser 71 int mFreq; // the currently read freq 72 String mWord; // the current word 73 final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; 74 75 /** 76 * Create the handler. 77 * 78 * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. 79 */ UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts)80 public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) { 81 mDictionary = null; 82 mShortcutsMap = shortcuts; 83 mWord = ""; 84 mState = START; 85 mFreq = 0; 86 } 87 getFinalDictionary()88 public FusionDictionary getFinalDictionary() { 89 final FusionDictionary dict = mDictionary; 90 for (final String shortcutOnly : mShortcutsMap.keySet()) { 91 if (dict.hasWord(shortcutOnly)) continue; 92 dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY), 93 mShortcutsMap.get(shortcutOnly), true /* isNotAWord */); 94 } 95 mDictionary = null; 96 mShortcutsMap.clear(); 97 mWord = ""; 98 mState = START; 99 mFreq = 0; 100 return dict; 101 } 102 103 @Override startElement(String uri, String localName, String qName, Attributes attrs)104 public void startElement(String uri, String localName, String qName, Attributes attrs) { 105 if (WORD_TAG.equals(localName)) { 106 mState = WORD; 107 mWord = ""; 108 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { 109 final String attrName = attrs.getLocalName(attrIndex); 110 if (PROBABILITY_ATTR.equals(attrName)) { 111 mFreq = Integer.parseInt(attrs.getValue(attrIndex)); 112 } 113 } 114 } else if (ROOT_TAG.equals(localName)) { 115 final HashMap<String, String> attributes = new HashMap<>(); 116 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { 117 final String attrName = attrs.getLocalName(attrIndex); 118 attributes.put(attrName, attrs.getValue(attrIndex)); 119 } 120 mDictionary = new FusionDictionary(new PtNodeArray(), 121 new DictionaryOptions(attributes)); 122 } else { 123 mState = UNKNOWN; 124 } 125 } 126 127 @Override characters(char[] ch, int start, int length)128 public void characters(char[] ch, int start, int length) { 129 if (WORD == mState) { 130 // The XML parser is free to return text in arbitrary chunks one after the 131 // other. In particular, this happens in some implementations when it finds 132 // an escape code like "&". 133 mWord += String.copyValueOf(ch, start, length); 134 } 135 } 136 137 @Override endElement(String uri, String localName, String qName)138 public void endElement(String uri, String localName, String qName) { 139 if (WORD == mState) { 140 mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord), 141 false /* isNotAWord */); 142 mState = START; 143 } 144 } 145 } 146 147 static private class AssociativeListHandler extends DefaultHandler { 148 private final String SRC_TAG; 149 private final String SRC_ATTRIBUTE; 150 private final String DST_TAG; 151 private final String DST_ATTRIBUTE; 152 private final String DST_FREQ; 153 154 // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX 155 private final static int XML_MAX = 256; 156 // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX 157 private final static int MEMORY_MAX = 256; 158 private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; 159 160 private String mSrc; 161 private final HashMap<String, ArrayList<WeightedString>> mAssocMap; 162 AssociativeListHandler(final String srcTag, final String srcAttribute, final String dstTag, final String dstAttribute, final String dstFreq)163 public AssociativeListHandler(final String srcTag, final String srcAttribute, 164 final String dstTag, final String dstAttribute, final String dstFreq) { 165 SRC_TAG = srcTag; 166 SRC_ATTRIBUTE = srcAttribute; 167 DST_TAG = dstTag; 168 DST_ATTRIBUTE = dstAttribute; 169 DST_FREQ = dstFreq; 170 mSrc = null; 171 mAssocMap = new HashMap<>(); 172 } 173 174 @Override startElement(String uri, String localName, String qName, Attributes attrs)175 public void startElement(String uri, String localName, String qName, Attributes attrs) { 176 if (SRC_TAG.equals(localName)) { 177 mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); 178 } else if (DST_TAG.equals(localName)) { 179 String dst = attrs.getValue(uri, DST_ATTRIBUTE); 180 int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ)); 181 WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); 182 ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); 183 if (null == bigramList) bigramList = new ArrayList<>(); 184 bigramList.add(bigram); 185 mAssocMap.put(mSrc, bigramList); 186 } 187 } 188 getValueFromFreqString(final String freqString)189 protected int getValueFromFreqString(final String freqString) { 190 return Integer.parseInt(freqString); 191 } 192 193 // This may return an empty map, but will never return null. getAssocMap()194 public HashMap<String, ArrayList<WeightedString>> getAssocMap() { 195 return mAssocMap; 196 } 197 } 198 199 /** 200 * SAX handler for a bigram XML file. 201 */ 202 static private class BigramHandler extends AssociativeListHandler { 203 private final static String BIGRAM_W1_TAG = "bi"; 204 private final static String BIGRAM_W2_TAG = "w"; 205 private final static String BIGRAM_W1_ATTRIBUTE = "w1"; 206 private final static String BIGRAM_W2_ATTRIBUTE = "w2"; 207 private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; 208 BigramHandler()209 public BigramHandler() { 210 super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, 211 BIGRAM_FREQ_ATTRIBUTE); 212 } 213 214 // As per getAssocMap(), this never returns null. getBigramMap()215 public HashMap<String, ArrayList<WeightedString>> getBigramMap() { 216 return getAssocMap(); 217 } 218 } 219 220 /** 221 * SAX handler for a shortcut & whitelist XML file. 222 */ 223 static private class ShortcutAndWhitelistHandler extends AssociativeListHandler { 224 private final static String ENTRY_TAG = "entry"; 225 private final static String ENTRY_ATTRIBUTE = "shortcut"; 226 private final static String TARGET_TAG = "target"; 227 private final static String REPLACEMENT_ATTRIBUTE = "replacement"; 228 private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; 229 private final static String WHITELIST_MARKER = "whitelist"; 230 private final static int WHITELIST_FREQ_VALUE = 15; 231 private final static int MIN_FREQ = 0; 232 private final static int MAX_FREQ = 14; 233 ShortcutAndWhitelistHandler()234 public ShortcutAndWhitelistHandler() { 235 super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, 236 TARGET_PRIORITY_ATTRIBUTE); 237 } 238 239 @Override getValueFromFreqString(final String freqString)240 protected int getValueFromFreqString(final String freqString) { 241 if (WHITELIST_MARKER.equals(freqString)) { 242 return WHITELIST_FREQ_VALUE; 243 } 244 final int intValue = super.getValueFromFreqString(freqString); 245 if (intValue < MIN_FREQ || intValue > MAX_FREQ) { 246 throw new RuntimeException("Shortcut freq out of range. Accepted range is " 247 + MIN_FREQ + ".." + MAX_FREQ); 248 } 249 return intValue; 250 } 251 252 // As per getAssocMap(), this never returns null. getShortcutAndWhitelistMap()253 public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() { 254 return getAssocMap(); 255 } 256 } 257 258 /** 259 * Basic test to find out whether the file is in the unigram XML format or not. 260 * 261 * Concretely this only tests the header line. 262 * 263 * @param filename The name of the file to test. 264 * @return true if the file is in the unigram XML format, false otherwise 265 */ isXmlUnigramDictionary(final String filename)266 public static boolean isXmlUnigramDictionary(final String filename) { 267 try (final BufferedReader reader = new BufferedReader( 268 new InputStreamReader(new FileInputStream(filename), "UTF-8"))) { 269 final String firstLine = reader.readLine(); 270 return firstLine.matches("^\\s*<wordlist .*>\\s*$"); 271 } catch (final IOException e) { 272 return false; 273 } 274 } 275 276 /** 277 * Reads a dictionary from an XML file. 278 * 279 * This is the public method that will parse an XML file and return the corresponding memory 280 * representation. 281 * 282 * @param unigrams the file to read the data from. 283 * @param shortcuts the file to read the shortcuts & whitelist from, or null. 284 * @param bigrams the file to read the bigrams from, or null. 285 * @return the in-memory representation of the dictionary. 286 */ readDictionaryXml(final BufferedInputStream unigrams, final BufferedInputStream shortcuts, final BufferedInputStream bigrams)287 public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams, 288 final BufferedInputStream shortcuts, final BufferedInputStream bigrams) 289 throws SAXException, IOException, ParserConfigurationException { 290 final SAXParserFactory factory = SAXParserFactory.newInstance(); 291 factory.setNamespaceAware(true); 292 final SAXParser parser = factory.newSAXParser(); 293 final BigramHandler bigramHandler = new BigramHandler(); 294 if (null != bigrams) parser.parse(bigrams, bigramHandler); 295 296 final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler = 297 new ShortcutAndWhitelistHandler(); 298 if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler); 299 300 final UnigramHandler unigramHandler = 301 new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap()); 302 parser.parse(unigrams, unigramHandler); 303 final FusionDictionary dict = unigramHandler.getFinalDictionary(); 304 final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap(); 305 for (final String firstWord : bigramMap.keySet()) { 306 if (!dict.hasWord(firstWord)) continue; 307 final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); 308 for (final WeightedString bigram : bigramList) { 309 if (!dict.hasWord(bigram.mWord)) continue; 310 dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo); 311 } 312 } 313 return dict; 314 } 315 316 /** 317 * Reads a dictionary in the first, legacy XML format 318 * 319 * This method reads data from the parser and creates a new FusionDictionary with it. 320 * The format parsed by this method is the format used before Ice Cream Sandwich, 321 * which has no support for bigrams or shortcuts/whitelist. 322 * It is important to note that this method expects the parser to have already eaten 323 * the first, all-encompassing tag. 324 * 325 * @param xpp the parser to read the data from. 326 * @return the parsed dictionary. 327 */ 328 329 /** 330 * Writes a dictionary to an XML file. 331 * 332 * The output format is the "second" format, which supports bigrams and shortcuts/whitelist. 333 * 334 * @param destination a destination stream to write to. 335 * @param dict the dictionary to write. 336 */ writeDictionaryXml(final BufferedWriter destination, final FusionDictionary dict)337 public static void writeDictionaryXml(final BufferedWriter destination, 338 final FusionDictionary dict) throws IOException { 339 final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>(); 340 for (WordProperty wordProperty : dict) { 341 wordPropertiesInDict.add(wordProperty); 342 } 343 // TODO: use an XMLSerializer if this gets big 344 destination.write("<wordlist format=\"2\""); 345 for (final String key : dict.mOptions.mAttributes.keySet()) { 346 final String value = dict.mOptions.mAttributes.get(key); 347 destination.write(" " + key + "=\"" + value + "\""); 348 } 349 destination.write(">\n"); 350 destination.write("<!-- Warning: there is no code to read this format yet. -->\n"); 351 for (WordProperty wordProperty : wordPropertiesInDict) { 352 destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord 353 + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability() 354 + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") 355 + "\">"); 356 if (null != wordProperty.mShortcutTargets) { 357 destination.write("\n"); 358 for (WeightedString target : wordProperty.mShortcutTargets) { 359 destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\"" 360 + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG 361 + ">\n"); 362 } 363 destination.write(" "); 364 } 365 if (null != wordProperty.mBigrams) { 366 destination.write("\n"); 367 for (WeightedString bigram : wordProperty.mBigrams) { 368 destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\"" 369 + bigram.getProbability() + "\">" + bigram.mWord 370 + "</" + BIGRAM_TAG + ">\n"); 371 } 372 destination.write(" "); 373 } 374 destination.write("</" + WORD_TAG + ">\n"); 375 } 376 destination.write("</wordlist>\n"); 377 destination.close(); 378 } 379 } 380