• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 #include <string.h>
22 #include <cutils/log.h>
23 
24 #include <unicode/uchar.h>
25 
26 //#define USE_ASSET_MANAGER
27 
28 #ifdef USE_ASSET_MANAGER
29 #include <utils/AssetManager.h>
30 #include <utils/Asset.h>
31 #endif
32 
33 #include "dictionary.h"
34 #include "basechars.h"
35 
36 #define DEBUG_DICT 0
37 
38 namespace latinime {
39 
Dictionary(void * dict,int typedLetterMultiplier,int fullWordMultiplier)40 Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
41 {
42     mDict = (unsigned char*) dict;
43     mTypedLetterMultiplier = typedLetterMultiplier;
44     mFullWordMultiplier = fullWordMultiplier;
45 }
46 
~Dictionary()47 Dictionary::~Dictionary()
48 {
49 }
50 
getSuggestions(int * codes,int codesSize,unsigned short * outWords,int * frequencies,int maxWordLength,int maxWords,int maxAlternatives,int skipPos)51 int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
52         int maxWordLength, int maxWords, int maxAlternatives, int skipPos)
53 {
54     mFrequencies = frequencies;
55     mOutputChars = outWords;
56     mInputCodes = codes;
57     mInputLength = codesSize;
58     mMaxAlternatives = maxAlternatives;
59     mMaxWordLength = maxWordLength;
60     mMaxWords = maxWords;
61     mWords = 0;
62     mSkipPos = skipPos;
63     mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
64 
65     getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
66 
67     if (DEBUG_DICT) LOGI("Returning %d words", mWords);
68     return mWords;
69 }
70 
71 unsigned short
getChar(int * pos)72 Dictionary::getChar(int *pos)
73 {
74     unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
75     // If the code is 255, then actual 16 bit code follows (in big endian)
76     if (ch == 0xFF) {
77         ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
78         (*pos) += 2;
79     }
80     return ch;
81 }
82 
83 int
getAddress(int * pos)84 Dictionary::getAddress(int *pos)
85 {
86     int address = 0;
87     if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
88         *pos += 1;
89     } else {
90         address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
91         address += (mDict[*pos + 1] & 0xFF) << 8;
92         address += (mDict[*pos + 2] & 0xFF);
93         *pos += 3;
94     }
95     return address;
96 }
97 
98 int
wideStrLen(unsigned short * str)99 Dictionary::wideStrLen(unsigned short *str)
100 {
101     if (!str) return 0;
102     unsigned short *end = str;
103     while (*end)
104         end++;
105     return end - str;
106 }
107 
108 bool
addWord(unsigned short * word,int length,int frequency)109 Dictionary::addWord(unsigned short *word, int length, int frequency)
110 {
111     word[length] = 0;
112     if (DEBUG_DICT) {
113         char s[length + 1];
114         for (int i = 0; i <= length; i++) s[i] = word[i];
115         LOGI("Found word = %s, freq = %d : \n", s, frequency);
116     }
117 
118     // Find the right insertion point
119     int insertAt = 0;
120     while (insertAt < mMaxWords) {
121         if (frequency > mFrequencies[insertAt]
122                  || (mFrequencies[insertAt] == frequency
123                      && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
124             break;
125         }
126         insertAt++;
127     }
128     if (insertAt < mMaxWords) {
129         memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
130                (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
131                (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
132         mFrequencies[insertAt] = frequency;
133         memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
134                (char*) mOutputChars + (insertAt    ) * mMaxWordLength * sizeof(short),
135                (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
136         unsigned short *dest = mOutputChars + (insertAt    ) * mMaxWordLength;
137         while (length--) {
138             *dest++ = *word++;
139         }
140         *dest = 0; // NULL terminate
141         // Update the word count
142         if (insertAt + 1 > mWords) mWords = insertAt + 1;
143         if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
144         return true;
145     }
146     return false;
147 }
148 
149 unsigned short
toLowerCase(unsigned short c)150 Dictionary::toLowerCase(unsigned short c) {
151     if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
152         c = BASE_CHARS[c];
153     }
154     if (c >='A' && c <= 'Z') {
155         c |= 32;
156     } else if (c > 127) {
157         c = u_tolower(c);
158     }
159     return c;
160 }
161 
162 bool
sameAsTyped(unsigned short * word,int length)163 Dictionary::sameAsTyped(unsigned short *word, int length)
164 {
165     if (length != mInputLength) {
166         return false;
167     }
168     int *inputCodes = mInputCodes;
169     while (length--) {
170         if ((unsigned int) *inputCodes != (unsigned int) *word) {
171             return false;
172         }
173         inputCodes += mMaxAlternatives;
174         word++;
175     }
176     return true;
177 }
178 
179 static char QUOTE = '\'';
180 
181 void
getWordsRec(int pos,int depth,int maxDepth,bool completion,int snr,int inputIndex,int diffs)182 Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
183                         int diffs)
184 {
185     // Optimization: Prune out words that are too long compared to how much was typed.
186     if (depth > maxDepth) {
187         return;
188     }
189     if (diffs > mMaxEditDistance) {
190         return;
191     }
192     int count = getCount(&pos);
193     int *currentChars = NULL;
194     if (mInputLength <= inputIndex) {
195         completion = true;
196     } else {
197         currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
198     }
199 
200     for (int i = 0; i < count; i++) {
201         unsigned short c = getChar(&pos);
202         unsigned short lowerC = toLowerCase(c);
203         bool terminal = getTerminal(&pos);
204         int childrenAddress = getAddress(&pos);
205         int freq = 1;
206         if (terminal) freq = getFreq(&pos);
207         // If we are only doing completions, no need to look at the typed characters.
208         if (completion) {
209             mWord[depth] = c;
210             if (terminal) {
211                 addWord(mWord, depth + 1, freq * snr);
212             }
213             if (childrenAddress != 0) {
214                 getWordsRec(childrenAddress, depth + 1, maxDepth,
215                             completion, snr, inputIndex, diffs);
216             }
217         } else if (c == QUOTE && currentChars[0] != QUOTE || mSkipPos == depth) {
218             // Skip the ' or other letter and continue deeper
219             mWord[depth] = c;
220             if (childrenAddress != 0) {
221                 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs);
222             }
223         } else {
224             int j = 0;
225             while (currentChars[j] > 0) {
226                 if (currentChars[j] == lowerC || currentChars[j] == c) {
227                     int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
228                     mWord[depth] = c;
229                     if (mInputLength == inputIndex + 1) {
230                         if (terminal) {
231                             if (//INCLUDE_TYPED_WORD_IF_VALID ||
232                                 !sameAsTyped(mWord, depth + 1)) {
233                                 int finalFreq = freq * snr * addedWeight;
234                                 if (mSkipPos < 0) finalFreq *= mFullWordMultiplier;
235                                 addWord(mWord, depth + 1, finalFreq);
236                             }
237                         }
238                         if (childrenAddress != 0) {
239                             getWordsRec(childrenAddress, depth + 1,
240                                     maxDepth, true, snr * addedWeight, inputIndex + 1,
241                                     diffs + (j > 0));
242                         }
243                     } else if (childrenAddress != 0) {
244                         getWordsRec(childrenAddress, depth + 1, maxDepth,
245                                 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0));
246                     }
247                 }
248                 j++;
249                 if (mSkipPos >= 0) break;
250             }
251         }
252     }
253 }
254 
255 bool
isValidWord(unsigned short * word,int length)256 Dictionary::isValidWord(unsigned short *word, int length)
257 {
258     return isValidWordRec(0, word, 0, length);
259 }
260 
261 bool
isValidWordRec(int pos,unsigned short * word,int offset,int length)262 Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
263     int count = getCount(&pos);
264     unsigned short currentChar = (unsigned short) word[offset];
265     for (int j = 0; j < count; j++) {
266         unsigned short c = getChar(&pos);
267         int terminal = getTerminal(&pos);
268         int childPos = getAddress(&pos);
269         if (c == currentChar) {
270             if (offset == length - 1) {
271                 if (terminal) {
272                     return true;
273                 }
274             } else {
275                 if (childPos != 0) {
276                     if (isValidWordRec(childPos, word, offset + 1, length)) {
277                         return true;
278                     }
279                 }
280             }
281         }
282         if (terminal) {
283             getFreq(&pos);
284         }
285         // There could be two instances of each alphabet - upper and lower case. So continue
286         // looking ...
287     }
288     return false;
289 }
290 
291 
292 } // namespace latinime
293