• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.dicttool;
18 
19 import com.android.inputmethod.latin.makedict.FormatSpec;
20 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
21 import com.android.inputmethod.latin.makedict.FusionDictionary;
22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
23 import com.android.inputmethod.latin.makedict.ProbabilityInfo;
24 import com.android.inputmethod.latin.makedict.WeightedString;
25 import com.android.inputmethod.latin.makedict.WordProperty;
26 import com.android.inputmethod.latin.utils.CombinedFormatUtils;
27 
28 import java.io.BufferedReader;
29 import java.io.BufferedWriter;
30 import java.io.FileReader;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.HashMap;
34 import java.util.TreeSet;
35 
36 /**
37  * Reads and writes combined format for a FusionDictionary.
38  *
39  * All functions in this class are static.
40  */
41 public class CombinedInputOutput {
42     private static final String WHITELIST_TAG = "whitelist";
43     private static final String OPTIONS_TAG = "options";
44     private static final String COMMENT_LINE_STARTER = "#";
45     private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;
46 
47     /**
48      * Basic test to find out whether the file is in the combined format or not.
49      *
50      * Concretely this only tests the header line.
51      *
52      * @param filename The name of the file to test.
53      * @return true if the file is in the combined format, false otherwise
54      */
isCombinedDictionary(final String filename)55     public static boolean isCombinedDictionary(final String filename) {
56         try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
57             String firstLine = reader.readLine();
58             while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
59                 firstLine = reader.readLine();
60             }
61             return firstLine.matches(
62                     "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
63         } catch (final IOException e) {
64             return false;
65         }
66     }
67 
68     /**
69      * Reads a dictionary from a combined format file.
70      *
71      * This is the public method that will read a combined file and return the corresponding memory
72      * representation.
73      *
74      * @param reader the buffered reader to read the data from.
75      * @return the in-memory representation of the dictionary.
76      */
readDictionaryCombined(final BufferedReader reader)77     public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
78             throws IOException {
79         String headerLine = reader.readLine();
80         while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
81             headerLine = reader.readLine();
82         }
83         final String header[] = headerLine.split(",");
84         final HashMap<String, String> attributes = new HashMap<>();
85         for (String item : header) {
86             final String keyValue[] = item.split("=");
87             if (2 != keyValue.length) {
88                 throw new RuntimeException("Wrong header format : " + headerLine);
89             }
90             attributes.put(keyValue[0], keyValue[1]);
91         }
92 
93         attributes.remove(OPTIONS_TAG);
94         final FusionDictionary dict =
95                 new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));
96 
97         String line;
98         String word = null;
99         ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
100         boolean isNotAWord = false;
101         ArrayList<WeightedString> bigrams = new ArrayList<>();
102         ArrayList<WeightedString> shortcuts = new ArrayList<>();
103         while (null != (line = reader.readLine())) {
104             if (line.startsWith(COMMENT_LINE_STARTER)) continue;
105             final String args[] = line.trim().split(",");
106             if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
107                 if (null != word) {
108                     dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
109                             isNotAWord);
110                     for (WeightedString s : bigrams) {
111                         dict.setBigram(word, s.mWord, s.mProbabilityInfo);
112                     }
113                 }
114                 if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>();
115                 if (!bigrams.isEmpty()) bigrams = new ArrayList<>();
116                 isNotAWord = false;
117                 for (String param : args) {
118                     final String params[] = param.split("=", 2);
119                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
120                     if (CombinedFormatUtils.WORD_TAG.equals(params[0])) {
121                         word = params[1];
122                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
123                         probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
124                                 probabilityInfo.mTimestamp, probabilityInfo.mLevel,
125                                 probabilityInfo.mCount);
126                     } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
127                         final String[] historicalInfoParams =
128                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
129                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
130                             throw new RuntimeException("Wrong format (historical info) : " + line);
131                         }
132                         probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability,
133                                 Integer.parseInt(historicalInfoParams[0]),
134                                 Integer.parseInt(historicalInfoParams[1]),
135                                 Integer.parseInt(historicalInfoParams[2]));
136                     } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) {
137                         isNotAWord = "true".equals(params[1]);
138                     }
139                 }
140             } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) {
141                 String shortcut = null;
142                 int shortcutFreq = 0;
143                 for (String param : args) {
144                     final String params[] = param.split("=", 2);
145                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
146                     if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) {
147                         shortcut = params[1];
148                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
149                         shortcutFreq = WHITELIST_TAG.equals(params[1])
150                                 ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
151                                 : Integer.parseInt(params[1]);
152                     }
153                 }
154                 if (null != shortcut) {
155                     shortcuts.add(new WeightedString(shortcut, shortcutFreq));
156                 } else {
157                     throw new RuntimeException("Wrong format : " + line);
158                 }
159             } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
160                 String secondWordOfBigram = null;
161                 ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
162                 for (String param : args) {
163                     final String params[] = param.split("=", 2);
164                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
165                     if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
166                         secondWordOfBigram = params[1];
167                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
168                         bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
169                                 bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
170                                 bigramProbabilityInfo.mCount);
171                     }  else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
172                         final String[] historicalInfoParams =
173                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
174                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
175                             throw new RuntimeException("Wrong format (historical info) : " + line);
176                         }
177                         bigramProbabilityInfo = new ProbabilityInfo(
178                                 bigramProbabilityInfo.mProbability,
179                                 Integer.parseInt(historicalInfoParams[0]),
180                                 Integer.parseInt(historicalInfoParams[1]),
181                                 Integer.parseInt(historicalInfoParams[2]));
182                     }
183                 }
184                 if (null != secondWordOfBigram) {
185                     bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
186                 } else {
187                     throw new RuntimeException("Wrong format : " + line);
188                 }
189             }
190         }
191         if (null != word) {
192             dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
193             for (WeightedString s : bigrams) {
194                 dict.setBigram(word, s.mWord, s.mProbabilityInfo);
195             }
196         }
197 
198         return dict;
199     }
200 
201     /**
202      * Writes a dictionary to a combined file.
203      *
204      * @param destination a destination writer.
205      * @param dict the dictionary to write.
206      */
writeDictionaryCombined(final BufferedWriter destination, final FusionDictionary dict)207     public static void writeDictionaryCombined(final BufferedWriter destination,
208             final FusionDictionary dict) throws IOException {
209         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
210         for (final WordProperty wordProperty : dict) {
211             // This for ordering by frequency, then by asciibetic order
212             wordPropertiesInDict.add(wordProperty);
213         }
214         destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
215         for (final WordProperty wordProperty : wordPropertiesInDict) {
216             destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
217         }
218     }
219 }
220