/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.CombinedFormatUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Reads and writes combined format for a FusionDictionary.
 *
 * All functions in this class are static.
 */
public class CombinedInputOutput {
    private static final String WHITELIST_TAG = "whitelist";
    private static final String OPTIONS_TAG = "options";
    private static final String COMMENT_LINE_STARTER = "#";
    private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;

    /**
     * Basic test to find out whether the file is in the combined format or not.
     *
     * Concretely this only tests the header line, i.e. the first line that does not start
     * with the comment marker.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the combined format, false otherwise. A file that cannot
     *         be read, is empty, or contains only comment lines is reported as not combined.
     */
    public static boolean isCombinedDictionary(final String filename) {
        try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            final String firstLine = readFirstNonCommentLine(reader);
            if (null == firstLine) {
                // Empty or comment-only file: there is no header, so this is not combined format.
                return false;
            }
            return firstLine.matches(
                    "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
        } catch (final IOException e) {
            // Deliberate best effort: an unreadable file is simply "not a combined dictionary".
            return false;
        }
    }

    /**
     * Reads a dictionary from a combined format file.
     *
     * This is the public method that will read a combined file and return the corresponding memory
     * representation.
     *
     * The first non-comment line is the header, a comma-separated list of key=value attributes.
     * Each following non-comment line is either a word entry or a bigram entry attached to the
     * most recently read word; lines of any other kind are ignored.
     *
     * @param reader the buffered reader to read the data from.
     * @return the in-memory representation of the dictionary.
     * @throws IOException if reading from the reader fails.
     * @throws RuntimeException if the header is missing or malformed, or if a word or bigram
     *         line is malformed.
     */
    public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
            throws IOException {
        final String headerLine = readFirstNonCommentLine(reader);
        if (null == headerLine) {
            // Previously this case crashed with a NullPointerException; report it like the
            // other header format errors instead.
            throw new RuntimeException("Wrong header format : no header line found");
        }
        final String[] header = headerLine.split(",");
        final HashMap<String, String> attributes = new HashMap<>();
        for (final String item : header) {
            final String[] keyValue = item.split("=");
            if (2 != keyValue.length) {
                throw new RuntimeException("Wrong header format : " + headerLine);
            }
            attributes.put(keyValue[0], keyValue[1]);
        }

        // The options attribute is dropped before the attributes are handed to the dictionary.
        attributes.remove(OPTIONS_TAG);
        final FusionDictionary dict =
                new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));

        String line;
        String word = null;
        ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
        boolean isNotAWord = false;
        boolean isPossiblyOffensive = false;
        final ArrayList<WeightedString> bigrams = new ArrayList<>();
        while (null != (line = reader.readLine())) {
            if (line.startsWith(COMMENT_LINE_STARTER)) continue;
            final String[] args = line.trim().split(",");
            if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
                // A new word starts: flush the previous one together with its bigrams.
                if (null != word) {
                    addWordWithBigrams(dict, word, probabilityInfo, isNotAWord,
                            isPossiblyOffensive, bigrams);
                }
                bigrams.clear();
                // Start every word from a clean state so that probability and historical info
                // of the previous word never leak into an entry that does not specify them.
                probabilityInfo = new ProbabilityInfo(0);
                isNotAWord = false;
                isPossiblyOffensive = false;
                for (final String param : args) {
                    final String[] params = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    switch (params[0]) {
                    case CombinedFormatUtils.WORD_TAG:
                        word = params[1];
                        break;
                    case CombinedFormatUtils.PROBABILITY_TAG:
                        probabilityInfo = withProbability(probabilityInfo, params[1]);
                        break;
                    case CombinedFormatUtils.HISTORICAL_INFO_TAG:
                        probabilityInfo = withHistoricalInfo(probabilityInfo, params[1], line);
                        break;
                    case CombinedFormatUtils.NOT_A_WORD_TAG:
                        isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]);
                        break;
                    case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG:
                        isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]);
                        break;
                    }
                }
            } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
                String secondWordOfBigram = null;
                ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
                for (final String param : args) {
                    final String[] params = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
                        secondWordOfBigram = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        bigramProbabilityInfo = withProbability(bigramProbabilityInfo, params[1]);
                    } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
                        bigramProbabilityInfo =
                                withHistoricalInfo(bigramProbabilityInfo, params[1], line);
                    }
                }
                if (null == secondWordOfBigram) {
                    throw new RuntimeException("Wrong format : " + line);
                }
                bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
            }
        }
        // Flush the last word of the file, which no following word line has triggered yet.
        if (null != word) {
            addWordWithBigrams(dict, word, probabilityInfo, isNotAWord, isPossiblyOffensive,
                    bigrams);
        }

        return dict;
    }

    /**
     * Writes a dictionary to a combined file.
     *
     * The attribute map of the dictionary options is written first, followed by one formatted
     * entry per word property.
     *
     * @param destination a destination writer.
     * @param dict the dictionary to write.
     * @throws IOException if writing to the destination fails.
     */
    public static void writeDictionaryCombined(final BufferedWriter destination,
            final FusionDictionary dict) throws IOException {
        // The TreeSet orders entries by WordProperty's natural ordering; per the original
        // comment this is by frequency, then by asciibetic order.
        final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
        for (final WordProperty wordProperty : dict) {
            wordPropertiesInDict.add(wordProperty);
        }
        destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
        for (final WordProperty wordProperty : wordPropertiesInDict) {
            destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
        }
    }

    /**
     * Returns the first line that does not start with the comment marker.
     *
     * @param reader the reader to consume lines from.
     * @return the first non-comment line, or null if the end of the stream is reached first.
     * @throws IOException if reading from the reader fails.
     */
    private static String readFirstNonCommentLine(final BufferedReader reader)
            throws IOException {
        String line = reader.readLine();
        while (null != line && line.startsWith(COMMENT_LINE_STARTER)) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Returns a copy of {@code base} whose probability is parsed from {@code value}; timestamp,
     * level and count are taken over from {@code base}.
     */
    private static ProbabilityInfo withProbability(final ProbabilityInfo base,
            final String value) {
        return new ProbabilityInfo(Integer.parseInt(value), base.mTimestamp, base.mLevel,
                base.mCount);
    }

    /**
     * Returns a copy of {@code base} whose three historical info elements are parsed from
     * {@code value}; the probability is taken over from {@code base}.
     *
     * @param line the full input line, used only for the error message.
     * @throws RuntimeException if {@code value} does not have exactly
     *         {@link #HISTORICAL_INFO_ELEMENT_COUNT} elements.
     */
    private static ProbabilityInfo withHistoricalInfo(final ProbabilityInfo base,
            final String value, final String line) {
        final String[] historicalInfoParams =
                value.split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
        if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
            throw new RuntimeException("Wrong format (historical info) : " + line);
        }
        return new ProbabilityInfo(base.mProbability,
                Integer.parseInt(historicalInfoParams[0]),
                Integer.parseInt(historicalInfoParams[1]),
                Integer.parseInt(historicalInfoParams[2]));
    }

    /**
     * Adds {@code word} to {@code dict}, then registers each of the given bigrams for it.
     */
    private static void addWordWithBigrams(final FusionDictionary dict, final String word,
            final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
            final boolean isPossiblyOffensive, final ArrayList<WeightedString> bigrams) {
        dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive);
        for (final WeightedString s : bigrams) {
            dict.setBigram(word, s.mWord, s.mProbabilityInfo);
        }
    }
}