• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 #include <string.h>
22 //#define LOG_TAG "dictionary.cpp"
23 //#include <cutils/log.h>
24 #define LOGI
25 
26 #include "dictionary.h"
27 #include "basechars.h"
28 #include "char_utils.h"
29 
30 #define DEBUG_DICT 0
31 #define DICTIONARY_VERSION_MIN 200
32 #define DICTIONARY_HEADER_SIZE 2
33 #define NOT_VALID_WORD -99
34 
35 namespace latinime {
36 
Dictionary(void * dict,int typedLetterMultiplier,int fullWordMultiplier)37 Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
38 {
39     mDict = (unsigned char*) dict;
40     mTypedLetterMultiplier = typedLetterMultiplier;
41     mFullWordMultiplier = fullWordMultiplier;
42     getVersionNumber();
43 }
44 
~Dictionary()45 Dictionary::~Dictionary()
46 {
47 }
48 
getSuggestions(int * codes,int codesSize,unsigned short * outWords,int * frequencies,int maxWordLength,int maxWords,int maxAlternatives,int skipPos,int * nextLetters,int nextLettersSize)49 int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
50         int maxWordLength, int maxWords, int maxAlternatives, int skipPos,
51         int *nextLetters, int nextLettersSize)
52 {
53     int suggWords;
54     mFrequencies = frequencies;
55     mOutputChars = outWords;
56     mInputCodes = codes;
57     mInputLength = codesSize;
58     mMaxAlternatives = maxAlternatives;
59     mMaxWordLength = maxWordLength;
60     mMaxWords = maxWords;
61     mSkipPos = skipPos;
62     mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
63     mNextLettersFrequencies = nextLetters;
64     mNextLettersSize = nextLettersSize;
65 
66     if (checkIfDictVersionIsLatest()) {
67         getWordsRec(DICTIONARY_HEADER_SIZE, 0, mInputLength * 3, false, 1, 0, 0);
68     } else {
69         getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
70     }
71 
72     // Get the word count
73     suggWords = 0;
74     while (suggWords < mMaxWords && mFrequencies[suggWords] > 0) suggWords++;
75     if (DEBUG_DICT) LOGI("Returning %d words", suggWords);
76 
77     if (DEBUG_DICT) {
78         LOGI("Next letters: ");
79         for (int k = 0; k < nextLettersSize; k++) {
80             if (mNextLettersFrequencies[k] > 0) {
81                 LOGI("%c = %d,", k, mNextLettersFrequencies[k]);
82             }
83         }
84         LOGI("\n");
85     }
86     return suggWords;
87 }
88 
89 void
registerNextLetter(unsigned short c)90 Dictionary::registerNextLetter(unsigned short c)
91 {
92     if (c < mNextLettersSize) {
93         mNextLettersFrequencies[c]++;
94     }
95 }
96 
97 void
getVersionNumber()98 Dictionary::getVersionNumber()
99 {
100     mVersion = (mDict[0] & 0xFF);
101     mBigram = (mDict[1] & 0xFF);
102     LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
103 }
104 
105 // Checks whether it has the latest dictionary or the old dictionary
106 bool
checkIfDictVersionIsLatest()107 Dictionary::checkIfDictVersionIsLatest()
108 {
109     return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0);
110 }
111 
112 unsigned short
getChar(int * pos)113 Dictionary::getChar(int *pos)
114 {
115     unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
116     // If the code is 255, then actual 16 bit code follows (in big endian)
117     if (ch == 0xFF) {
118         ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
119         (*pos) += 2;
120     }
121     return ch;
122 }
123 
124 int
getAddress(int * pos)125 Dictionary::getAddress(int *pos)
126 {
127     int address = 0;
128     if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
129         *pos += 1;
130     } else {
131         address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
132         address += (mDict[*pos + 1] & 0xFF) << 8;
133         address += (mDict[*pos + 2] & 0xFF);
134         *pos += 3;
135     }
136     return address;
137 }
138 
139 int
getFreq(int * pos)140 Dictionary::getFreq(int *pos)
141 {
142     int freq = mDict[(*pos)++] & 0xFF;
143 
144     if (checkIfDictVersionIsLatest()) {
145         // skipping bigram
146         int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
147         if (bigramExist > 0) {
148             int nextBigramExist = 1;
149             while (nextBigramExist > 0) {
150                 (*pos) += 3;
151                 nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
152             }
153         } else {
154             (*pos)++;
155         }
156     }
157 
158     return freq;
159 }
160 
161 int
wideStrLen(unsigned short * str)162 Dictionary::wideStrLen(unsigned short *str)
163 {
164     if (!str) return 0;
165     unsigned short *end = str;
166     while (*end)
167         end++;
168     return end - str;
169 }
170 
171 bool
addWord(unsigned short * word,int length,int frequency)172 Dictionary::addWord(unsigned short *word, int length, int frequency)
173 {
174     word[length] = 0;
175     if (DEBUG_DICT) {
176         char s[length + 1];
177         for (int i = 0; i <= length; i++) s[i] = word[i];
178         LOGI("Found word = %s, freq = %d : \n", s, frequency);
179     }
180 
181     // Find the right insertion point
182     int insertAt = 0;
183     while (insertAt < mMaxWords) {
184         if (frequency > mFrequencies[insertAt]
185                  || (mFrequencies[insertAt] == frequency
186                      && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
187             break;
188         }
189         insertAt++;
190     }
191     if (insertAt < mMaxWords) {
192         memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
193                (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
194                (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
195         mFrequencies[insertAt] = frequency;
196         memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
197                (char*) mOutputChars + (insertAt    ) * mMaxWordLength * sizeof(short),
198                (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
199         unsigned short *dest = mOutputChars + (insertAt    ) * mMaxWordLength;
200         while (length--) {
201             *dest++ = *word++;
202         }
203         *dest = 0; // NULL terminate
204         if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
205         return true;
206     }
207     return false;
208 }
209 
210 bool
addWordBigram(unsigned short * word,int length,int frequency)211 Dictionary::addWordBigram(unsigned short *word, int length, int frequency)
212 {
213     word[length] = 0;
214     if (DEBUG_DICT) {
215         char s[length + 1];
216         for (int i = 0; i <= length; i++) s[i] = word[i];
217         LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
218     }
219 
220     // Find the right insertion point
221     int insertAt = 0;
222     while (insertAt < mMaxBigrams) {
223         if (frequency > mBigramFreq[insertAt]
224                  || (mBigramFreq[insertAt] == frequency
225                      && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) {
226             break;
227         }
228         insertAt++;
229     }
230     LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
231     if (insertAt < mMaxBigrams) {
232         memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
233                (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
234                (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
235         mBigramFreq[insertAt] = frequency;
236         memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
237                (char*) mBigramChars + (insertAt    ) * mMaxWordLength * sizeof(short),
238                (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength);
239         unsigned short *dest = mBigramChars + (insertAt    ) * mMaxWordLength;
240         while (length--) {
241             *dest++ = *word++;
242         }
243         *dest = 0; // NULL terminate
244         if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
245         return true;
246     }
247     return false;
248 }
249 
250 unsigned short
toLowerCase(unsigned short c)251 Dictionary::toLowerCase(unsigned short c) {
252     if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
253         c = BASE_CHARS[c];
254     }
255     if (c >='A' && c <= 'Z') {
256         c |= 32;
257     } else if (c > 127) {
258         c = latin_tolower(c);
259     }
260     return c;
261 }
262 
263 bool
sameAsTyped(unsigned short * word,int length)264 Dictionary::sameAsTyped(unsigned short *word, int length)
265 {
266     if (length != mInputLength) {
267         return false;
268     }
269     int *inputCodes = mInputCodes;
270     while (length--) {
271         if ((unsigned int) *inputCodes != (unsigned int) *word) {
272             return false;
273         }
274         inputCodes += mMaxAlternatives;
275         word++;
276     }
277     return true;
278 }
279 
280 static char QUOTE = '\'';
281 
282 void
getWordsRec(int pos,int depth,int maxDepth,bool completion,int snr,int inputIndex,int diffs)283 Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
284                         int diffs)
285 {
286     // Optimization: Prune out words that are too long compared to how much was typed.
287     if (depth > maxDepth) {
288         return;
289     }
290     if (diffs > mMaxEditDistance) {
291         return;
292     }
293     int count = getCount(&pos);
294     int *currentChars = NULL;
295     if (mInputLength <= inputIndex) {
296         completion = true;
297     } else {
298         currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
299     }
300 
301     for (int i = 0; i < count; i++) {
302         // -- at char
303         unsigned short c = getChar(&pos);
304         // -- at flag/add
305         unsigned short lowerC = toLowerCase(c);
306         bool terminal = getTerminal(&pos);
307         int childrenAddress = getAddress(&pos);
308         // -- after address or flag
309         int freq = 1;
310         if (terminal) freq = getFreq(&pos);
311         // -- after add or freq
312 
313         // If we are only doing completions, no need to look at the typed characters.
314         if (completion) {
315             mWord[depth] = c;
316             if (terminal) {
317                 addWord(mWord, depth + 1, freq * snr);
318                 if (depth >= mInputLength && mSkipPos < 0) {
319                     registerNextLetter(mWord[mInputLength]);
320                 }
321             }
322             if (childrenAddress != 0) {
323                 getWordsRec(childrenAddress, depth + 1, maxDepth,
324                             completion, snr, inputIndex, diffs);
325             }
326         } else if ((c == QUOTE && currentChars[0] != QUOTE) || mSkipPos == depth) {
327             // Skip the ' or other letter and continue deeper
328             mWord[depth] = c;
329             if (childrenAddress != 0) {
330                 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs);
331             }
332         } else {
333             int j = 0;
334             while (currentChars[j] > 0) {
335                 if (currentChars[j] == lowerC || currentChars[j] == c) {
336                     int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
337                     mWord[depth] = c;
338                     if (mInputLength == inputIndex + 1) {
339                         if (terminal) {
340                             if (//INCLUDE_TYPED_WORD_IF_VALID ||
341                                 !sameAsTyped(mWord, depth + 1)) {
342                                 int finalFreq = freq * snr * addedWeight;
343                                 if (mSkipPos < 0) finalFreq *= mFullWordMultiplier;
344                                 addWord(mWord, depth + 1, finalFreq);
345                             }
346                         }
347                         if (childrenAddress != 0) {
348                             getWordsRec(childrenAddress, depth + 1,
349                                     maxDepth, true, snr * addedWeight, inputIndex + 1,
350                                     diffs + (j > 0));
351                         }
352                     } else if (childrenAddress != 0) {
353                         getWordsRec(childrenAddress, depth + 1, maxDepth,
354                                 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0));
355                     }
356                 }
357                 j++;
358                 if (mSkipPos >= 0) break;
359             }
360         }
361     }
362 }
363 
364 int
getBigramAddress(int * pos,bool advance)365 Dictionary::getBigramAddress(int *pos, bool advance)
366 {
367     int address = 0;
368 
369     address += (mDict[*pos] & 0x3F) << 16;
370     address += (mDict[*pos + 1] & 0xFF) << 8;
371     address += (mDict[*pos + 2] & 0xFF);
372 
373     if (advance) {
374         *pos += 3;
375     }
376 
377     return address;
378 }
379 
380 int
getBigramFreq(int * pos)381 Dictionary::getBigramFreq(int *pos)
382 {
383     int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
384 
385     return freq;
386 }
387 
388 
389 int
getBigrams(unsigned short * prevWord,int prevWordLength,int * codes,int codesSize,unsigned short * bigramChars,int * bigramFreq,int maxWordLength,int maxBigrams,int maxAlternatives)390 Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
391         unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
392         int maxAlternatives)
393 {
394     mBigramFreq = bigramFreq;
395     mBigramChars = bigramChars;
396     mInputCodes = codes;
397     mInputLength = codesSize;
398     mMaxWordLength = maxWordLength;
399     mMaxBigrams = maxBigrams;
400     mMaxAlternatives = maxAlternatives;
401 
402     if (mBigram == 1 && checkIfDictVersionIsLatest()) {
403         int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
404         LOGI("Pos -> %d\n", pos);
405         if (pos < 0) {
406             return 0;
407         }
408 
409         int bigramCount = 0;
410         int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
411         if (bigramExist > 0) {
412             int nextBigramExist = 1;
413             while (nextBigramExist > 0 && bigramCount < maxBigrams) {
414                 int bigramAddress = getBigramAddress(&pos, true);
415                 int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
416                 // search for all bigrams and store them
417                 searchForTerminalNode(bigramAddress, frequency);
418                 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
419                 bigramCount++;
420             }
421         }
422 
423         return bigramCount;
424     }
425     return 0;
426 }
427 
428 void
searchForTerminalNode(int addressLookingFor,int frequency)429 Dictionary::searchForTerminalNode(int addressLookingFor, int frequency)
430 {
431     // track word with such address and store it in an array
432     unsigned short word[mMaxWordLength];
433 
434     int pos;
435     int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
436     bool found = false;
437     char followingChar = ' ';
438     int depth = -1;
439 
440     while(!found) {
441         bool followDownAddressSearchStop = false;
442         bool firstAddress = true;
443         bool haveToSearchAll = true;
444 
445         if (depth >= 0) {
446             word[depth] = (unsigned short) followingChar;
447         }
448         pos = followDownBranchAddress; // pos start at count
449         int count = mDict[pos] & 0xFF;
450         LOGI("count - %d\n",count);
451         pos++;
452         for (int i = 0; i < count; i++) {
453             // pos at data
454             pos++;
455             // pos now at flag
456             if (!getFirstBitOfByte(&pos)) { // non-terminal
457                 if (!followDownAddressSearchStop) {
458                     int addr = getBigramAddress(&pos, false);
459                     if (addr > addressLookingFor) {
460                         followDownAddressSearchStop = true;
461                         if (firstAddress) {
462                             firstAddress = false;
463                             haveToSearchAll = true;
464                         } else if (!haveToSearchAll) {
465                             break;
466                         }
467                     } else {
468                         followDownBranchAddress = addr;
469                         followingChar = (char)(0xFF & mDict[pos-1]);
470                         if (firstAddress) {
471                             firstAddress = false;
472                             haveToSearchAll = false;
473                         }
474                     }
475                 }
476                 pos += 3;
477             } else if (getFirstBitOfByte(&pos)) { // terminal
478                 if (addressLookingFor == (pos-1)) { // found !!
479                     depth++;
480                     word[depth] = (0xFF & mDict[pos-1]);
481                     found = true;
482                     break;
483                 }
484                 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
485                     if (!followDownAddressSearchStop) {
486                         int addr = getBigramAddress(&pos, false);
487                         if (addr > addressLookingFor) {
488                             followDownAddressSearchStop = true;
489                             if (firstAddress) {
490                                 firstAddress = false;
491                                 haveToSearchAll = true;
492                             } else if (!haveToSearchAll) {
493                                 break;
494                             }
495                         } else {
496                             followDownBranchAddress = addr;
497                             followingChar = (char)(0xFF & mDict[pos-1]);
498                             if (firstAddress) {
499                                 firstAddress = false;
500                                 haveToSearchAll = true;
501                             }
502                         }
503                     }
504                     pos += 4;
505                 } else { // freq only (2 byte)
506                     pos += 2;
507                 }
508 
509                 // skipping bigram
510                 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
511                 if (bigramExist > 0) {
512                     int nextBigramExist = 1;
513                     while (nextBigramExist > 0) {
514                         pos += 3;
515                         nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
516                     }
517                 } else {
518                     pos++;
519                 }
520             }
521         }
522         depth++;
523         if (followDownBranchAddress == 0) {
524             LOGI("ERROR!!! Cannot find bigram!!");
525             break;
526         }
527     }
528     if (checkFirstCharacter(word)) {
529         addWordBigram(word, depth, frequency);
530     }
531 }
532 
533 bool
checkFirstCharacter(unsigned short * word)534 Dictionary::checkFirstCharacter(unsigned short *word)
535 {
536     // Checks whether this word starts with same character or neighboring characters of
537     // what user typed.
538 
539     int *inputCodes = mInputCodes;
540     int maxAlt = mMaxAlternatives;
541     while (maxAlt > 0) {
542         if ((unsigned int) *inputCodes == (unsigned int) *word) {
543             return true;
544         }
545         inputCodes++;
546         maxAlt--;
547     }
548     return false;
549 }
550 
551 bool
isValidWord(unsigned short * word,int length)552 Dictionary::isValidWord(unsigned short *word, int length)
553 {
554     if (checkIfDictVersionIsLatest()) {
555         return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
556     } else {
557         return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
558     }
559 }
560 
561 int
isValidWordRec(int pos,unsigned short * word,int offset,int length)562 Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
563     // returns address of bigram data of that word
564     // return -99 if not found
565 
566     int count = getCount(&pos);
567     unsigned short currentChar = (unsigned short) word[offset];
568     for (int j = 0; j < count; j++) {
569         unsigned short c = getChar(&pos);
570         int terminal = getTerminal(&pos);
571         int childPos = getAddress(&pos);
572         if (c == currentChar) {
573             if (offset == length - 1) {
574                 if (terminal) {
575                     return (pos+1);
576                 }
577             } else {
578                 if (childPos != 0) {
579                     int t = isValidWordRec(childPos, word, offset + 1, length);
580                     if (t > 0) {
581                         return t;
582                     }
583                 }
584             }
585         }
586         if (terminal) {
587             getFreq(&pos);
588         }
589         // There could be two instances of each alphabet - upper and lower case. So continue
590         // looking ...
591     }
592     return NOT_VALID_WORD;
593 }
594 
595 
596 } // namespace latinime
597