1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 ** http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 #include <string.h>
22 #include <cutils/log.h>
23
24 #include <unicode/uchar.h>
25
26 //#define USE_ASSET_MANAGER
27
28 #ifdef USE_ASSET_MANAGER
29 #include <utils/AssetManager.h>
30 #include <utils/Asset.h>
31 #endif
32
33 #include "dictionary.h"
34 #include "basechars.h"
35
36 #define DEBUG_DICT 0
37
38 namespace latinime {
39
Dictionary(void * dict,int typedLetterMultiplier,int fullWordMultiplier)40 Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
41 {
42 mDict = (unsigned char*) dict;
43 mTypedLetterMultiplier = typedLetterMultiplier;
44 mFullWordMultiplier = fullWordMultiplier;
45 }
46
~Dictionary()47 Dictionary::~Dictionary()
48 {
49 }
50
getSuggestions(int * codes,int codesSize,unsigned short * outWords,int * frequencies,int maxWordLength,int maxWords,int maxAlternatives,int skipPos)51 int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
52 int maxWordLength, int maxWords, int maxAlternatives, int skipPos)
53 {
54 mFrequencies = frequencies;
55 mOutputChars = outWords;
56 mInputCodes = codes;
57 mInputLength = codesSize;
58 mMaxAlternatives = maxAlternatives;
59 mMaxWordLength = maxWordLength;
60 mMaxWords = maxWords;
61 mWords = 0;
62 mSkipPos = skipPos;
63 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
64
65 getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
66
67 if (DEBUG_DICT) LOGI("Returning %d words", mWords);
68 return mWords;
69 }
70
71 unsigned short
getChar(int * pos)72 Dictionary::getChar(int *pos)
73 {
74 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
75 // If the code is 255, then actual 16 bit code follows (in big endian)
76 if (ch == 0xFF) {
77 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
78 (*pos) += 2;
79 }
80 return ch;
81 }
82
83 int
getAddress(int * pos)84 Dictionary::getAddress(int *pos)
85 {
86 int address = 0;
87 if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
88 *pos += 1;
89 } else {
90 address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
91 address += (mDict[*pos + 1] & 0xFF) << 8;
92 address += (mDict[*pos + 2] & 0xFF);
93 *pos += 3;
94 }
95 return address;
96 }
97
98 int
wideStrLen(unsigned short * str)99 Dictionary::wideStrLen(unsigned short *str)
100 {
101 if (!str) return 0;
102 unsigned short *end = str;
103 while (*end)
104 end++;
105 return end - str;
106 }
107
108 bool
addWord(unsigned short * word,int length,int frequency)109 Dictionary::addWord(unsigned short *word, int length, int frequency)
110 {
111 word[length] = 0;
112 if (DEBUG_DICT) {
113 char s[length + 1];
114 for (int i = 0; i <= length; i++) s[i] = word[i];
115 LOGI("Found word = %s, freq = %d : \n", s, frequency);
116 }
117
118 // Find the right insertion point
119 int insertAt = 0;
120 while (insertAt < mMaxWords) {
121 if (frequency > mFrequencies[insertAt]
122 || (mFrequencies[insertAt] == frequency
123 && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
124 break;
125 }
126 insertAt++;
127 }
128 if (insertAt < mMaxWords) {
129 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
130 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
131 (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
132 mFrequencies[insertAt] = frequency;
133 memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
134 (char*) mOutputChars + (insertAt ) * mMaxWordLength * sizeof(short),
135 (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
136 unsigned short *dest = mOutputChars + (insertAt ) * mMaxWordLength;
137 while (length--) {
138 *dest++ = *word++;
139 }
140 *dest = 0; // NULL terminate
141 // Update the word count
142 if (insertAt + 1 > mWords) mWords = insertAt + 1;
143 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
144 return true;
145 }
146 return false;
147 }
148
149 unsigned short
toLowerCase(unsigned short c)150 Dictionary::toLowerCase(unsigned short c) {
151 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
152 c = BASE_CHARS[c];
153 }
154 if (c >='A' && c <= 'Z') {
155 c |= 32;
156 } else if (c > 127) {
157 c = u_tolower(c);
158 }
159 return c;
160 }
161
162 bool
sameAsTyped(unsigned short * word,int length)163 Dictionary::sameAsTyped(unsigned short *word, int length)
164 {
165 if (length != mInputLength) {
166 return false;
167 }
168 int *inputCodes = mInputCodes;
169 while (length--) {
170 if ((unsigned int) *inputCodes != (unsigned int) *word) {
171 return false;
172 }
173 inputCodes += mMaxAlternatives;
174 word++;
175 }
176 return true;
177 }
178
179 static char QUOTE = '\'';
180
181 void
getWordsRec(int pos,int depth,int maxDepth,bool completion,int snr,int inputIndex,int diffs)182 Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
183 int diffs)
184 {
185 // Optimization: Prune out words that are too long compared to how much was typed.
186 if (depth > maxDepth) {
187 return;
188 }
189 if (diffs > mMaxEditDistance) {
190 return;
191 }
192 int count = getCount(&pos);
193 int *currentChars = NULL;
194 if (mInputLength <= inputIndex) {
195 completion = true;
196 } else {
197 currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
198 }
199
200 for (int i = 0; i < count; i++) {
201 unsigned short c = getChar(&pos);
202 unsigned short lowerC = toLowerCase(c);
203 bool terminal = getTerminal(&pos);
204 int childrenAddress = getAddress(&pos);
205 int freq = 1;
206 if (terminal) freq = getFreq(&pos);
207 // If we are only doing completions, no need to look at the typed characters.
208 if (completion) {
209 mWord[depth] = c;
210 if (terminal) {
211 addWord(mWord, depth + 1, freq * snr);
212 }
213 if (childrenAddress != 0) {
214 getWordsRec(childrenAddress, depth + 1, maxDepth,
215 completion, snr, inputIndex, diffs);
216 }
217 } else if (c == QUOTE && currentChars[0] != QUOTE || mSkipPos == depth) {
218 // Skip the ' or other letter and continue deeper
219 mWord[depth] = c;
220 if (childrenAddress != 0) {
221 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs);
222 }
223 } else {
224 int j = 0;
225 while (currentChars[j] > 0) {
226 if (currentChars[j] == lowerC || currentChars[j] == c) {
227 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
228 mWord[depth] = c;
229 if (mInputLength == inputIndex + 1) {
230 if (terminal) {
231 if (//INCLUDE_TYPED_WORD_IF_VALID ||
232 !sameAsTyped(mWord, depth + 1)) {
233 int finalFreq = freq * snr * addedWeight;
234 if (mSkipPos < 0) finalFreq *= mFullWordMultiplier;
235 addWord(mWord, depth + 1, finalFreq);
236 }
237 }
238 if (childrenAddress != 0) {
239 getWordsRec(childrenAddress, depth + 1,
240 maxDepth, true, snr * addedWeight, inputIndex + 1,
241 diffs + (j > 0));
242 }
243 } else if (childrenAddress != 0) {
244 getWordsRec(childrenAddress, depth + 1, maxDepth,
245 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0));
246 }
247 }
248 j++;
249 if (mSkipPos >= 0) break;
250 }
251 }
252 }
253 }
254
255 bool
isValidWord(unsigned short * word,int length)256 Dictionary::isValidWord(unsigned short *word, int length)
257 {
258 return isValidWordRec(0, word, 0, length);
259 }
260
261 bool
isValidWordRec(int pos,unsigned short * word,int offset,int length)262 Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
263 int count = getCount(&pos);
264 unsigned short currentChar = (unsigned short) word[offset];
265 for (int j = 0; j < count; j++) {
266 unsigned short c = getChar(&pos);
267 int terminal = getTerminal(&pos);
268 int childPos = getAddress(&pos);
269 if (c == currentChar) {
270 if (offset == length - 1) {
271 if (terminal) {
272 return true;
273 }
274 } else {
275 if (childPos != 0) {
276 if (isValidWordRec(childPos, word, offset + 1, length)) {
277 return true;
278 }
279 }
280 }
281 }
282 if (terminal) {
283 getFreq(&pos);
284 }
285 // There could be two instances of each alphabet - upper and lower case. So continue
286 // looking ...
287 }
288 return false;
289 }
290
291
292 } // namespace latinime
293