1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 ** http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 #include <string.h>
22 #include <cutils/log.h>
23
24 #include <unicode/uchar.h>
25
26 //#define USE_ASSET_MANAGER
27
28 #ifdef USE_ASSET_MANAGER
29 #include <utils/AssetManager.h>
30 #include <utils/Asset.h>
31 #endif
32
33 #include "dictionary.h"
34 #include "basechars.h"
35
36 #define DEBUG_DICT 0
37
38 namespace latinime {
39
Dictionary(void * dict,int typedLetterMultiplier,int fullWordMultiplier)40 Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
41 {
42 mDict = (unsigned char*) dict;
43 mTypedLetterMultiplier = typedLetterMultiplier;
44 mFullWordMultiplier = fullWordMultiplier;
45 }
46
~Dictionary()47 Dictionary::~Dictionary()
48 {
49 }
50
getSuggestions(int * codes,int codesSize,unsigned short * outWords,int * frequencies,int maxWordLength,int maxWords,int maxAlternatives)51 int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
52 int maxWordLength, int maxWords, int maxAlternatives)
53 {
54 memset(frequencies, 0, maxWords * sizeof(*frequencies));
55 memset(outWords, 0, maxWords * maxWordLength * sizeof(*outWords));
56
57 mFrequencies = frequencies;
58 mOutputChars = outWords;
59 mInputCodes = codes;
60 mInputLength = codesSize;
61 mMaxAlternatives = maxAlternatives;
62 mMaxWordLength = maxWordLength;
63 mMaxWords = maxWords;
64 mWords = 0;
65
66 getWordsRec(0, 0, mInputLength * 3, false, 1, 0);
67
68 if (DEBUG_DICT) LOGI("Returning %d words", mWords);
69 return mWords;
70 }
71
72 unsigned short
getChar(int * pos)73 Dictionary::getChar(int *pos)
74 {
75 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
76 // If the code is 255, then actual 16 bit code follows (in big endian)
77 if (ch == 0xFF) {
78 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
79 (*pos) += 2;
80 }
81 return ch;
82 }
83
84 int
getAddress(int * pos)85 Dictionary::getAddress(int *pos)
86 {
87 int address = 0;
88 if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
89 *pos += 1;
90 } else {
91 address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
92 address += (mDict[*pos + 1] & 0xFF) << 8;
93 address += (mDict[*pos + 2] & 0xFF);
94 *pos += 3;
95 }
96 return address;
97 }
98
99 int
wideStrLen(unsigned short * str)100 Dictionary::wideStrLen(unsigned short *str)
101 {
102 if (!str) return 0;
103 unsigned short *end = str;
104 while (*end)
105 end++;
106 return end - str;
107 }
108
109 bool
addWord(unsigned short * word,int length,int frequency)110 Dictionary::addWord(unsigned short *word, int length, int frequency)
111 {
112 word[length] = 0;
113 if (DEBUG_DICT) LOGI("Found word = %s, freq = %d : \n", word, frequency);
114
115 // Find the right insertion point
116 int insertAt = 0;
117 while (insertAt < mMaxWords) {
118 if (frequency > mFrequencies[insertAt]
119 || (mFrequencies[insertAt] == frequency
120 && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
121 break;
122 }
123 insertAt++;
124 }
125 if (insertAt < mMaxWords) {
126 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
127 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
128 (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
129 mFrequencies[insertAt] = frequency;
130 memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
131 (char*) mOutputChars + (insertAt ) * mMaxWordLength * sizeof(short),
132 (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
133 unsigned short *dest = mOutputChars + (insertAt ) * mMaxWordLength;
134 while (length--) {
135 *dest++ = *word++;
136 }
137 *dest = 0; // NULL terminate
138 // Update the word count
139 if (insertAt + 1 > mWords) mWords = insertAt + 1;
140 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
141 return true;
142 }
143 return false;
144 }
145
146 unsigned short
toLowerCase(unsigned short c,const int depth)147 Dictionary::toLowerCase(unsigned short c, const int depth) {
148 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
149 c = BASE_CHARS[c];
150 }
151 if (depth == 0) {
152 if (c >='A' && c <= 'Z') {
153 c |= 32;
154 } else if (c > 127) {
155 c = u_tolower(c);
156 }
157 }
158 return c;
159 }
160
161 bool
sameAsTyped(unsigned short * word,int length)162 Dictionary::sameAsTyped(unsigned short *word, int length)
163 {
164 if (length != mInputLength) {
165 return false;
166 }
167 int *inputCodes = mInputCodes;
168 while (length--) {
169 if ((unsigned int) *inputCodes != (unsigned int) *word) {
170 return false;
171 }
172 inputCodes += mMaxAlternatives;
173 word++;
174 }
175 return true;
176 }
177
178 static char QUOTE = '\'';
179
180 void
getWordsRec(int pos,int depth,int maxDepth,bool completion,int snr,int inputIndex)181 Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex)
182 {
183 // Optimization: Prune out words that are too long compared to how much was typed.
184 if (depth > maxDepth) {
185 return;
186 }
187 int count = getCount(&pos);
188 int *currentChars = NULL;
189 if (mInputLength <= inputIndex) {
190 completion = true;
191 } else {
192 currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
193 }
194
195 for (int i = 0; i < count; i++) {
196 unsigned short c = getChar(&pos);
197 unsigned short lowerC = toLowerCase(c, depth);
198 bool terminal = getTerminal(&pos);
199 int childrenAddress = getAddress(&pos);
200 int freq = 1;
201 if (terminal) freq = getFreq(&pos);
202 // If we are only doing completions, no need to look at the typed characters.
203 if (completion) {
204 mWord[depth] = c;
205 if (terminal) {
206 addWord(mWord, depth + 1, freq * snr);
207 }
208 if (childrenAddress != 0) {
209 getWordsRec(childrenAddress, depth + 1, maxDepth,
210 completion, snr, inputIndex);
211 }
212 } else if (c == QUOTE && currentChars[0] != QUOTE) {
213 // Skip the ' and continue deeper
214 mWord[depth] = QUOTE;
215 if (childrenAddress != 0) {
216 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex);
217 }
218 } else {
219 int j = 0;
220 while (currentChars[j] > 0) {
221 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
222 if (currentChars[j] == lowerC || currentChars[j] == c) {
223 mWord[depth] = c;
224 if (mInputLength == inputIndex + 1) {
225 if (terminal) {
226 if (//INCLUDE_TYPED_WORD_IF_VALID ||
227 !sameAsTyped(mWord, depth + 1)) {
228 addWord(mWord, depth + 1,
229 (freq * snr * addedWeight * mFullWordMultiplier));
230 }
231 }
232 if (childrenAddress != 0) {
233 getWordsRec(childrenAddress, depth + 1,
234 maxDepth, true, snr * addedWeight, inputIndex + 1);
235 }
236 } else if (childrenAddress != 0) {
237 getWordsRec(childrenAddress, depth + 1, maxDepth,
238 false, snr * addedWeight, inputIndex + 1);
239 }
240 }
241 j++;
242 }
243 }
244 }
245 }
246
247 bool
isValidWord(unsigned short * word,int length)248 Dictionary::isValidWord(unsigned short *word, int length)
249 {
250 return isValidWordRec(0, word, 0, length);
251 }
252
253 bool
isValidWordRec(int pos,unsigned short * word,int offset,int length)254 Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
255 int count = getCount(&pos);
256 unsigned short currentChar = (unsigned short) word[offset];
257 for (int j = 0; j < count; j++) {
258 unsigned short c = getChar(&pos);
259 int terminal = getTerminal(&pos);
260 int childPos = getAddress(&pos);
261 if (c == currentChar) {
262 if (offset == length - 1) {
263 if (terminal) {
264 return true;
265 }
266 } else {
267 if (childPos != 0) {
268 if (isValidWordRec(childPos, word, offset + 1, length)) {
269 return true;
270 }
271 }
272 }
273 }
274 if (terminal) {
275 getFreq(&pos);
276 }
277 // There could be two instances of each alphabet - upper and lower case. So continue
278 // looking ...
279 }
280 return false;
281 }
282
283
284 } // namespace latinime
285