• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 #include <string.h>
22 #include <cutils/log.h>
23 
24 #include <unicode/uchar.h>
25 
26 //#define USE_ASSET_MANAGER
27 
28 #ifdef USE_ASSET_MANAGER
29 #include <utils/AssetManager.h>
30 #include <utils/Asset.h>
31 #endif
32 
33 #include "dictionary.h"
34 #include "basechars.h"
35 
36 #define DEBUG_DICT 0
37 
38 namespace latinime {
39 
Dictionary(void * dict,int typedLetterMultiplier,int fullWordMultiplier)40 Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
41 {
42     mDict = (unsigned char*) dict;
43     mTypedLetterMultiplier = typedLetterMultiplier;
44     mFullWordMultiplier = fullWordMultiplier;
45 }
46 
~Dictionary()47 Dictionary::~Dictionary()
48 {
49 }
50 
getSuggestions(int * codes,int codesSize,unsigned short * outWords,int * frequencies,int maxWordLength,int maxWords,int maxAlternatives)51 int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
52         int maxWordLength, int maxWords, int maxAlternatives)
53 {
54     memset(frequencies, 0, maxWords * sizeof(*frequencies));
55     memset(outWords, 0, maxWords * maxWordLength * sizeof(*outWords));
56 
57     mFrequencies = frequencies;
58     mOutputChars = outWords;
59     mInputCodes = codes;
60     mInputLength = codesSize;
61     mMaxAlternatives = maxAlternatives;
62     mMaxWordLength = maxWordLength;
63     mMaxWords = maxWords;
64     mWords = 0;
65 
66     getWordsRec(0, 0, mInputLength * 3, false, 1, 0);
67 
68     if (DEBUG_DICT) LOGI("Returning %d words", mWords);
69     return mWords;
70 }
71 
72 unsigned short
getChar(int * pos)73 Dictionary::getChar(int *pos)
74 {
75     unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
76     // If the code is 255, then actual 16 bit code follows (in big endian)
77     if (ch == 0xFF) {
78         ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
79         (*pos) += 2;
80     }
81     return ch;
82 }
83 
84 int
getAddress(int * pos)85 Dictionary::getAddress(int *pos)
86 {
87     int address = 0;
88     if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
89         *pos += 1;
90     } else {
91         address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
92         address += (mDict[*pos + 1] & 0xFF) << 8;
93         address += (mDict[*pos + 2] & 0xFF);
94         *pos += 3;
95     }
96     return address;
97 }
98 
99 int
wideStrLen(unsigned short * str)100 Dictionary::wideStrLen(unsigned short *str)
101 {
102     if (!str) return 0;
103     unsigned short *end = str;
104     while (*end)
105         end++;
106     return end - str;
107 }
108 
109 bool
addWord(unsigned short * word,int length,int frequency)110 Dictionary::addWord(unsigned short *word, int length, int frequency)
111 {
112     word[length] = 0;
113     if (DEBUG_DICT) LOGI("Found word = %s, freq = %d : \n", word, frequency);
114 
115     // Find the right insertion point
116     int insertAt = 0;
117     while (insertAt < mMaxWords) {
118         if (frequency > mFrequencies[insertAt]
119                  || (mFrequencies[insertAt] == frequency
120                      && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
121             break;
122         }
123         insertAt++;
124     }
125     if (insertAt < mMaxWords) {
126         memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
127                (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
128                (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
129         mFrequencies[insertAt] = frequency;
130         memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
131                (char*) mOutputChars + (insertAt    ) * mMaxWordLength * sizeof(short),
132                (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
133         unsigned short *dest = mOutputChars + (insertAt    ) * mMaxWordLength;
134         while (length--) {
135             *dest++ = *word++;
136         }
137         *dest = 0; // NULL terminate
138         // Update the word count
139         if (insertAt + 1 > mWords) mWords = insertAt + 1;
140         if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
141         return true;
142     }
143     return false;
144 }
145 
146 unsigned short
toLowerCase(unsigned short c,const int depth)147 Dictionary::toLowerCase(unsigned short c, const int depth) {
148     if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
149         c = BASE_CHARS[c];
150     }
151     if (depth == 0) {
152         if (c >='A' && c <= 'Z') {
153             c |= 32;
154         } else if (c > 127) {
155             c = u_tolower(c);
156         }
157     }
158     return c;
159 }
160 
161 bool
sameAsTyped(unsigned short * word,int length)162 Dictionary::sameAsTyped(unsigned short *word, int length)
163 {
164     if (length != mInputLength) {
165         return false;
166     }
167     int *inputCodes = mInputCodes;
168     while (length--) {
169         if ((unsigned int) *inputCodes != (unsigned int) *word) {
170             return false;
171         }
172         inputCodes += mMaxAlternatives;
173         word++;
174     }
175     return true;
176 }
177 
178 static char QUOTE = '\'';
179 
180 void
getWordsRec(int pos,int depth,int maxDepth,bool completion,int snr,int inputIndex)181 Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex)
182 {
183     // Optimization: Prune out words that are too long compared to how much was typed.
184     if (depth > maxDepth) {
185         return;
186     }
187     int count = getCount(&pos);
188     int *currentChars = NULL;
189     if (mInputLength <= inputIndex) {
190         completion = true;
191     } else {
192         currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
193     }
194 
195     for (int i = 0; i < count; i++) {
196         unsigned short c = getChar(&pos);
197         unsigned short lowerC = toLowerCase(c, depth);
198         bool terminal = getTerminal(&pos);
199         int childrenAddress = getAddress(&pos);
200         int freq = 1;
201         if (terminal) freq = getFreq(&pos);
202         // If we are only doing completions, no need to look at the typed characters.
203         if (completion) {
204             mWord[depth] = c;
205             if (terminal) {
206                 addWord(mWord, depth + 1, freq * snr);
207             }
208             if (childrenAddress != 0) {
209                 getWordsRec(childrenAddress, depth + 1, maxDepth,
210                             completion, snr, inputIndex);
211             }
212         } else if (c == QUOTE && currentChars[0] != QUOTE) {
213             // Skip the ' and continue deeper
214             mWord[depth] = QUOTE;
215             if (childrenAddress != 0) {
216                 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex);
217             }
218         } else {
219             int j = 0;
220             while (currentChars[j] > 0) {
221                 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
222                 if (currentChars[j] == lowerC || currentChars[j] == c) {
223                     mWord[depth] = c;
224                     if (mInputLength == inputIndex + 1) {
225                         if (terminal) {
226                             if (//INCLUDE_TYPED_WORD_IF_VALID ||
227                                 !sameAsTyped(mWord, depth + 1)) {
228                                 addWord(mWord, depth + 1,
229                                     (freq * snr * addedWeight * mFullWordMultiplier));
230                             }
231                         }
232                         if (childrenAddress != 0) {
233                             getWordsRec(childrenAddress, depth + 1,
234                                     maxDepth, true, snr * addedWeight, inputIndex + 1);
235                         }
236                     } else if (childrenAddress != 0) {
237                         getWordsRec(childrenAddress, depth + 1, maxDepth,
238                                 false, snr * addedWeight, inputIndex + 1);
239                     }
240                 }
241                 j++;
242             }
243         }
244     }
245 }
246 
247 bool
isValidWord(unsigned short * word,int length)248 Dictionary::isValidWord(unsigned short *word, int length)
249 {
250     return isValidWordRec(0, word, 0, length);
251 }
252 
253 bool
isValidWordRec(int pos,unsigned short * word,int offset,int length)254 Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
255     int count = getCount(&pos);
256     unsigned short currentChar = (unsigned short) word[offset];
257     for (int j = 0; j < count; j++) {
258         unsigned short c = getChar(&pos);
259         int terminal = getTerminal(&pos);
260         int childPos = getAddress(&pos);
261         if (c == currentChar) {
262             if (offset == length - 1) {
263                 if (terminal) {
264                     return true;
265                 }
266             } else {
267                 if (childPos != 0) {
268                     if (isValidWordRec(childPos, word, offset + 1, length)) {
269                         return true;
270                     }
271                 }
272             }
273         }
274         if (terminal) {
275             getFreq(&pos);
276         }
277         // There could be two instances of each alphabet - upper and lower case. So continue
278         // looking ...
279     }
280     return false;
281 }
282 
283 
284 } // namespace latinime
285