• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cstring>
18 
19 #define LOG_TAG "LatinIME: unigram_dictionary.cpp"
20 
21 #include "binary_format.h"
22 #include "char_utils.h"
23 #include "defines.h"
24 #include "dictionary.h"
25 #include "digraph_utils.h"
26 #include "proximity_info.h"
27 #include "terminal_attributes.h"
28 #include "unigram_dictionary.h"
29 #include "words_priority_queue.h"
30 #include "words_priority_queue_pool.h"
31 
32 namespace latinime {
33 
34 // TODO: check the header
UnigramDictionary(const uint8_t * const streamStart,const unsigned int dictFlags)35 UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags)
36         : DICT_ROOT(streamStart), ROOT_POS(0),
37           MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) {
38     if (DEBUG_DICT) {
39         AKLOGI("UnigramDictionary - constructor");
40     }
41 }
42 
~UnigramDictionary()43 UnigramDictionary::~UnigramDictionary() {
44 }
45 
46 // TODO: This needs to take a const int* and not tinker with its contents
addWord(int * word,int length,int probability,WordsPriorityQueue * queue,int type)47 static void addWord(int *word, int length, int probability, WordsPriorityQueue *queue, int type) {
48     queue->push(probability, word, length, type);
49 }
50 
51 // Return the replacement code point for a digraph, or 0 if none.
getDigraphReplacement(const int * codes,const int i,const int inputSize,const DigraphUtils::digraph_t * const digraphs,const unsigned int digraphsSize) const52 int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int inputSize,
53         const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
54 
55     // There can't be a digraph if we don't have at least 2 characters to examine
56     if (i + 2 > inputSize) return false;
57 
58     // Search for the first char of some digraph
59     int lastDigraphIndex = -1;
60     const int thisChar = codes[i];
61     for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) {
62         if (thisChar == digraphs[lastDigraphIndex].first) break;
63     }
64     // No match: return early
65     if (lastDigraphIndex < 0) return 0;
66 
67     // It's an interesting digraph if the second char matches too.
68     if (digraphs[lastDigraphIndex].second == codes[i + 1]) {
69         return digraphs[lastDigraphIndex].compositeGlyph;
70     } else {
71         return 0;
72     }
73 }
74 
75 // Mostly the same arguments as the non-recursive version, except:
76 // codes is the original value. It points to the start of the work buffer, and gets passed as is.
77 // inputSize is the size of the user input (thus, it is the size of codesSrc).
78 // codesDest is the current point in the work buffer.
79 // codesSrc is the current point in the user-input, original, content-unmodified buffer.
80 // codesRemain is the remaining size in codesSrc.
getWordWithDigraphSuggestionsRec(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codesBuffer,int * xCoordinatesBuffer,int * yCoordinatesBuffer,const int codesBufferSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,const int * codesSrc,const int codesRemain,const int currentDepth,int * codesDest,Correction * correction,WordsPriorityQueuePool * queuePool,const DigraphUtils::digraph_t * const digraphs,const unsigned int digraphsSize) const81 void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
82         const int *xcoordinates, const int *ycoordinates, const int *codesBuffer,
83         int *xCoordinatesBuffer, int *yCoordinatesBuffer,
84         const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
85         const bool useFullEditDistance, const int *codesSrc,
86         const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
87         WordsPriorityQueuePool *queuePool,
88         const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
89     ASSERT(sizeof(codesDest[0]) == sizeof(codesSrc[0]));
90     ASSERT(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0]));
91     ASSERT(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0]));
92 
93     const int startIndex = static_cast<int>(codesDest - codesBuffer);
94     if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) {
95         for (int i = 0; i < codesRemain; ++i) {
96             xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i];
97             yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i];
98             const int replacementCodePoint =
99                     getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize);
100             if (0 != replacementCodePoint) {
101                 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
102 
103                 // Copy the word up to the first char of the digraph, including proximity chars,
104                 // and overwrite the primary code with the replacement code point. Then, continue
105                 // processing on the remaining part of the word, skipping the second char of the
106                 // digraph.
107                 // In our example, copy "pru", replace "u" with the version with the diaeresis and
108                 // continue running on "fen".
109                 // Make i the index of the second char of the digraph for simplicity. Forgetting
110                 // to do that results in an infinite recursion so take care!
111                 ++i;
112                 memcpy(codesDest, codesSrc, i * sizeof(codesDest[0]));
113                 codesDest[i - 1] = replacementCodePoint;
114                 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
115                         codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
116                         bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1,
117                         codesRemain - i - 1, currentDepth + 1, codesDest + i, correction,
118                         queuePool, digraphs, digraphsSize);
119 
120                 // Copy the second char of the digraph in place, then continue processing on
121                 // the remaining part of the word.
122                 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
123                 memcpy(codesDest + i, codesSrc + i, sizeof(codesDest[0]));
124                 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
125                         codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
126                         bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i,
127                         currentDepth + 1, codesDest + i, correction, queuePool, digraphs,
128                         digraphsSize);
129                 return;
130             }
131         }
132     }
133 
134     // If we come here, we hit the end of the word: let's check it against the dictionary.
135     // In our example, we'll come here once for "prufen" and then once for "pruefen".
136     // If the word contains several digraphs, we'll come it for the product of them.
137     // eg. if the word is "ueberpruefen" we'll test, in order, against
138     // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
139     const unsigned int remainingBytes = sizeof(codesDest[0]) * codesRemain;
140     if (0 != remainingBytes) {
141         memcpy(codesDest, codesSrc, remainingBytes);
142         memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain],
143                 sizeof(xCoordinatesBuffer[0]) * codesRemain);
144         memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain],
145                 sizeof(yCoordinatesBuffer[0]) * codesRemain);
146     }
147 
148     getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer,
149             startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction,
150             queuePool);
151 }
152 
153 // bigramMap contains the association <bigram address> -> <bigram probability>
154 // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
155 // in bigram_dictionary.cpp
getSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * inputCodePoints,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,int * outWords,int * frequencies,int * outputTypes) const156 int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
157         const int *ycoordinates, const int *inputCodePoints, const int inputSize,
158         const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
159         const bool useFullEditDistance, int *outWords, int *frequencies, int *outputTypes) const {
160     WordsPriorityQueuePool queuePool(MAX_RESULTS, SUB_QUEUE_MAX_WORDS);
161     queuePool.clearAll();
162     Correction masterCorrection;
163     masterCorrection.resetCorrection();
164     const DigraphUtils::digraph_t *digraphs = 0;
165     const int digraphsSize =
166             DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(DICT_FLAGS, &digraphs);
167     if (digraphsSize > 0)
168     { // Incrementally tune the word and try all possibilities
169         int codesBuffer[sizeof(*inputCodePoints) * inputSize];
170         int xCoordinatesBuffer[inputSize];
171         int yCoordinatesBuffer[inputSize];
172         getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
173                 xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter,
174                 useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection,
175                 &queuePool, digraphs, digraphsSize);
176     } else { // Normal processing
177         getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize,
178                 bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool);
179     }
180 
181     PROF_START(20);
182     if (DEBUG_DICT) {
183         float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
184                 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0);
185         ns += 0;
186         AKLOGI("Max normalized score = %f", ns);
187     }
188     const int suggestedWordsCount =
189             queuePool.getMasterQueue()->outputSuggestions(masterCorrection.getPrimaryInputWord(),
190                     inputSize, frequencies, outWords, outputTypes);
191 
192     if (DEBUG_DICT) {
193         float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
194                 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0);
195         ns += 0;
196         AKLOGI("Returning %d words", suggestedWordsCount);
197         /// Print the returned words
198         for (int j = 0; j < suggestedWordsCount; ++j) {
199             int *w = outWords + j * MAX_WORD_LENGTH;
200             char s[MAX_WORD_LENGTH];
201             for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
202             (void)s; // To suppress compiler warning
203             AKLOGI("%s %i", s, frequencies[j]);
204         }
205     }
206     PROF_END(20);
207     PROF_CLOSE;
208     return suggestedWordsCount;
209 }
210 
getWordSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * inputCodePoints,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,Correction * correction,WordsPriorityQueuePool * queuePool) const211 void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
212         const int *ycoordinates, const int *inputCodePoints, const int inputSize,
213         const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
214         const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool)
215         const {
216     PROF_OPEN;
217     PROF_START(0);
218     PROF_END(0);
219 
220     PROF_START(1);
221     getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, bigramMap,
222             bigramFilter, useFullEditDistance, inputSize, correction, queuePool);
223     PROF_END(1);
224 
225     PROF_START(2);
226     // Note: This line is intentionally left blank
227     PROF_END(2);
228 
229     PROF_START(3);
230     // Note: This line is intentionally left blank
231     PROF_END(3);
232 
233     PROF_START(4);
234     bool hasAutoCorrectionCandidate = false;
235     WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
236     if (masterQueue->size() > 0) {
237         float nsForMaster = masterQueue->getHighestNormalizedScore(
238                 correction->getPrimaryInputWord(), inputSize, 0, 0, 0);
239         hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD);
240     }
241     PROF_END(4);
242 
243     PROF_START(5);
244     // Multiple word suggestions
245     if (SUGGEST_MULTIPLE_WORDS
246             && inputSize >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
247         getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints,
248                 useFullEditDistance, inputSize, correction, queuePool,
249                 hasAutoCorrectionCandidate);
250     }
251     PROF_END(5);
252 
253     PROF_START(6);
254     // Note: This line is intentionally left blank
255     PROF_END(6);
256 
257     if (DEBUG_DICT) {
258         queuePool->dumpSubQueue1TopSuggestions();
259         for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
260             WordsPriorityQueue *queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i);
261             if (queue->size() > 0) {
262                 WordsPriorityQueue::SuggestedWord *sw = queue->top();
263                 const int score = sw->mScore;
264                 const int *word = sw->mWord;
265                 const int wordLength = sw->mWordLength;
266                 float ns = Correction::RankingAlgorithm::calcNormalizedScore(
267                         correction->getPrimaryInputWord(), i, word, wordLength, score);
268                 ns += 0;
269                 AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
270                         (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD));
271                 DUMP_WORD(correction->getPrimaryInputWord(), i);
272                 DUMP_WORD(word, wordLength);
273             }
274         }
275     }
276 }
277 
initSuggestions(ProximityInfo * proximityInfo,const int * xCoordinates,const int * yCoordinates,const int * codes,const int inputSize,Correction * correction) const278 void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates,
279         const int *yCoordinates, const int *codes, const int inputSize,
280         Correction *correction) const {
281     if (DEBUG_DICT) {
282         AKLOGI("initSuggest");
283         DUMP_WORD(codes, inputSize);
284     }
285     correction->initInputParams(proximityInfo, codes, inputSize, xCoordinates, yCoordinates);
286     const int maxDepth = min(inputSize * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
287     correction->initCorrection(proximityInfo, inputSize, maxDepth);
288 }
289 
getOneWordSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool) const290 void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
291         const int *xcoordinates, const int *ycoordinates, const int *codes,
292         const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
293         const bool useFullEditDistance, const int inputSize,
294         Correction *correction, WordsPriorityQueuePool *queuePool) const {
295     initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputSize, correction);
296     getSuggestionCandidates(useFullEditDistance, inputSize, bigramMap, bigramFilter, correction,
297             queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
298 }
299 
getSuggestionCandidates(const bool useFullEditDistance,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,Correction * correction,WordsPriorityQueuePool * queuePool,const bool doAutoCompletion,const int maxErrors,const int currentWordIndex) const300 void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
301         const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
302         Correction *correction, WordsPriorityQueuePool *queuePool,
303         const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) const {
304     uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount();
305     if (DEBUG_DICT) {
306         AKLOGI("Traverse count %d", totalTraverseCount);
307     }
308     if (totalTraverseCount > MULTIPLE_WORDS_SUGGESTION_MAX_TOTAL_TRAVERSE_COUNT) {
309         if (DEBUG_DICT) {
310             AKLOGI("Abort traversing %d", totalTraverseCount);
311         }
312         return;
313     }
314     // TODO: Remove setCorrectionParams
315     correction->setCorrectionParams(0, 0, 0,
316             -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
317             doAutoCompletion, maxErrors);
318     int rootPosition = ROOT_POS;
319     // Get the number of children of root, then increment the position
320     int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition);
321     int outputIndex = 0;
322 
323     correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0));
324 
325     // Depth first search
326     while (outputIndex >= 0) {
327         if (correction->initProcessState(outputIndex)) {
328             int siblingPos = correction->getTreeSiblingPos(outputIndex);
329             int firstChildPos;
330 
331             const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
332                     bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos,
333                     queuePool, currentWordIndex);
334             // Update next sibling pos
335             correction->setTreeSiblingPos(outputIndex, siblingPos);
336 
337             if (needsToTraverseChildrenNodes) {
338                 // Goes to child node
339                 outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos);
340             }
341         } else {
342             // Goes to parent sibling node
343             outputIndex = correction->getTreeParentIndex(outputIndex);
344         }
345     }
346 }
347 
onTerminal(const int probability,const TerminalAttributes & terminalAttributes,Correction * correction,WordsPriorityQueuePool * queuePool,const bool addToMasterQueue,const int currentWordIndex) const348 void UnigramDictionary::onTerminal(const int probability,
349         const TerminalAttributes &terminalAttributes, Correction *correction,
350         WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
351         const int currentWordIndex) const {
352     const int inputIndex = correction->getInputIndex();
353     const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
354 
355     int wordLength;
356     int *wordPointer;
357 
358     if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) {
359         WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
360         const int finalProbability =
361                 correction->getFinalProbability(probability, &wordPointer, &wordLength);
362 
363         if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) {
364             // If the probability is 0, we don't want to add this word. However we still
365             // want to add its shortcuts (including a possible whitelist entry) if any.
366             // Furthermore, if this is not a word (shortcut only for example) or a blacklisted
367             // entry then we never want to suggest this.
368             addWord(wordPointer, wordLength, finalProbability, masterQueue,
369                     Dictionary::KIND_CORRECTION);
370         }
371 
372         const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0;
373         // Please note that the shortcut candidates will be added to the master queue only.
374         TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator();
375         while (iterator.hasNextShortcutTarget()) {
376             // TODO: addWord only supports weak ordering, meaning we have no means
377             // to control the order of the shortcuts relative to one another or to the word.
378             // We need to either modulate the probability of each shortcut according
379             // to its own shortcut probability or to make the queue
380             // so that the insert order is protected inside the queue for words
381             // with the same score. For the moment we use -1 to make sure the shortcut will
382             // never be in front of the word.
383             int shortcutTarget[MAX_WORD_LENGTH];
384             int shortcutFrequency;
385             const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
386                     MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency);
387             int shortcutScore;
388             int kind;
389             if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_PROBABILITY
390                     && correction->sameAsTyped()) {
391                 shortcutScore = S_INT_MAX;
392                 kind = Dictionary::KIND_WHITELIST;
393             } else {
394                 shortcutScore = shortcutProbability;
395                 kind = Dictionary::KIND_CORRECTION;
396             }
397             addWord(shortcutTarget, shortcutTargetStringLength, shortcutScore,
398                     masterQueue, kind);
399         }
400     }
401 
402     // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
403     // or more length.
404     if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
405         WordsPriorityQueue *subQueue;
406         subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex);
407         if (!subQueue) {
408             return;
409         }
410         const int finalProbability = correction->getFinalProbabilityForSubQueue(
411                 probability, &wordPointer, &wordLength, inputIndex);
412         addWord(wordPointer, wordLength, finalProbability, subQueue, Dictionary::KIND_CORRECTION);
413     }
414 }
415 
getSubStringSuggestion(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,Correction * correction,WordsPriorityQueuePool * queuePool,const int inputSize,const bool hasAutoCorrectionCandidate,const int currentWordIndex,const int inputWordStartPos,const int inputWordLength,const int outputWordStartPos,const bool isSpaceProximity,int * freqArray,int * wordLengthArray,int * outputWord,int * outputWordLength) const416 int UnigramDictionary::getSubStringSuggestion(
417         ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
418         const int *codes, const bool useFullEditDistance, Correction *correction,
419         WordsPriorityQueuePool *queuePool, const int inputSize,
420         const bool hasAutoCorrectionCandidate, const int currentWordIndex,
421         const int inputWordStartPos, const int inputWordLength,
422         const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
423         int *wordLengthArray, int *outputWord, int *outputWordLength) const {
424     if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) {
425         return FLAG_MULTIPLE_SUGGEST_ABORT;
426     }
427 
428     /////////////////////////////////////////////
429     // safety net for multiple word suggestion //
430     // TODO: Remove this safety net            //
431     /////////////////////////////////////////////
432     int smallWordCount = 0;
433     int singleLetterWordCount = 0;
434     if (inputWordLength == 1) {
435         ++singleLetterWordCount;
436     }
437     if (inputWordLength <= 2) {
438         // small word == single letter or 2-letter word
439         ++smallWordCount;
440     }
441     for (int i = 0; i < currentWordIndex; ++i) {
442         const int length = wordLengthArray[i];
443         if (length == 1) {
444             ++singleLetterWordCount;
445             // Safety net to avoid suggesting sequential single letter words
446             if (i < (currentWordIndex - 1)) {
447                 if (wordLengthArray[i + 1] == 1) {
448                     return FLAG_MULTIPLE_SUGGEST_ABORT;
449                 }
450             } else if (inputWordLength == 1) {
451                 return FLAG_MULTIPLE_SUGGEST_ABORT;
452             }
453         }
454         if (length <= 2) {
455             ++smallWordCount;
456         }
457         // Safety net to avoid suggesting multiple words with many (4 or more, for now) small words
458         if (singleLetterWordCount >= 3 || smallWordCount >= 4) {
459             return FLAG_MULTIPLE_SUGGEST_ABORT;
460         }
461     }
462     //////////////////////////////////////////////
463     // TODO: Remove the safety net above        //
464     //////////////////////////////////////////////
465 
466     int *tempOutputWord = 0;
467     int nextWordLength = 0;
468     // TODO: Optimize init suggestion
469     initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
470             inputSize, correction);
471 
472     int word[MAX_WORD_LENGTH];
473     int freq = getMostProbableWordLike(
474             inputWordStartPos, inputWordLength, correction, word);
475     if (freq > 0) {
476         nextWordLength = inputWordLength;
477         tempOutputWord = word;
478     } else if (!hasAutoCorrectionCandidate) {
479         if (inputWordStartPos > 0) {
480             const int offset = inputWordStartPos;
481             initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
482                     codes + offset, inputWordLength, correction);
483             queuePool->clearSubQueue(currentWordIndex);
484             // TODO: pass the bigram list for substring suggestion
485             getSuggestionCandidates(useFullEditDistance, inputWordLength,
486                     0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool,
487                     false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
488             if (DEBUG_DICT) {
489                 if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
490                     AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
491                     for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
492                         queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
493                     }
494                 }
495             }
496         }
497         WordsPriorityQueue *queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
498         // TODO: Return the correct value depending on doAutoCompletion
499         if (!queue || queue->size() <= 0) {
500             return FLAG_MULTIPLE_SUGGEST_ABORT;
501         }
502         int score = 0;
503         const float ns = queue->getHighestNormalizedScore(
504                 correction->getPrimaryInputWord(), inputWordLength,
505                 &tempOutputWord, &score, &nextWordLength);
506         if (DEBUG_DICT) {
507             AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
508         }
509         // Two words correction won't be done if the score of the first word doesn't exceed the
510         // threshold.
511         if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
512                 || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
513             return FLAG_MULTIPLE_SUGGEST_SKIP;
514         }
515         freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
516     }
517     if (DEBUG_DICT) {
518         AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)",
519                 currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos,
520                 (currentWordIndex > 0) ? wordLengthArray[0] : 0);
521     }
522     if (freq <= 0 || nextWordLength <= 0
523             || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) {
524         return FLAG_MULTIPLE_SUGGEST_SKIP;
525     }
526     for (int i = 0; i < nextWordLength; ++i) {
527         outputWord[outputWordStartPos + i] = tempOutputWord[i];
528     }
529 
530     // Put output values
531     freqArray[currentWordIndex] = freq;
532     // TODO: put output length instead of input length
533     wordLengthArray[currentWordIndex] = inputWordLength;
534     const int tempOutputWordLength = outputWordStartPos + nextWordLength;
535     if (outputWordLength) {
536         *outputWordLength = tempOutputWordLength;
537     }
538 
539     if ((inputWordStartPos + inputWordLength) < inputSize) {
540         if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
541             return FLAG_MULTIPLE_SUGGEST_SKIP;
542         }
543         outputWord[tempOutputWordLength] = KEYCODE_SPACE;
544         if (outputWordLength) {
545             ++*outputWordLength;
546         }
547     } else if (currentWordIndex >= 1) {
548         // TODO: Handle 3 or more words
549         const int pairFreq = correction->getFreqForSplitMultipleWords(
550                 freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
551         if (DEBUG_DICT) {
552             DUMP_WORD(outputWord, tempOutputWordLength);
553             for (int i = 0; i < currentWordIndex + 1; ++i) {
554                 AKLOGI("Split %d,%d words: freq = %d, length = %d", i, currentWordIndex + 1,
555                         freqArray[i], wordLengthArray[i]);
556             }
557             AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq,
558                     inputSize, tempOutputWordLength, isSpaceProximity);
559         }
560         addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue(),
561                 Dictionary::KIND_CORRECTION);
562     }
563     return FLAG_MULTIPLE_SUGGEST_CONTINUE;
564 }
565 
getMultiWordsSuggestionRec(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool,const bool hasAutoCorrectionCandidate,const int startInputPos,const int startWordIndex,const int outputWordLength,int * freqArray,int * wordLengthArray,int * outputWord) const566 void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
567         const int *xcoordinates, const int *ycoordinates, const int *codes,
568         const bool useFullEditDistance, const int inputSize, Correction *correction,
569         WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate,
570         const int startInputPos, const int startWordIndex, const int outputWordLength,
571         int *freqArray, int *wordLengthArray, int *outputWord) const {
572     if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) {
573         // Return if the last word index
574         return;
575     }
576     if (startWordIndex >= 1
577             && (hasAutoCorrectionCandidate
578                     || inputSize < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
579         // Do not suggest 3+ words if already has auto correction candidate
580         return;
581     }
582     for (int i = startInputPos + 1; i < inputSize; ++i) {
583         if (DEBUG_CORRECTION_FREQ) {
584             AKLOGI("Multi words(%d), start in %d sep %d start out %d",
585                     startWordIndex, startInputPos, i, outputWordLength);
586             DUMP_WORD(outputWord, outputWordLength);
587         }
588         int tempOutputWordLength = 0;
589         // Current word
590         int inputWordStartPos = startInputPos;
591         int inputWordLength = i - startInputPos;
592         const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates,
593                 codes, useFullEditDistance, correction, queuePool, inputSize,
594                 hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength,
595                 outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord,
596                 &tempOutputWordLength);
597         if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_ABORT) {
598             // TODO: break here
599             continue;
600         } else if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_SKIP) {
601             continue;
602         }
603 
604         if (DEBUG_CORRECTION_FREQ) {
605             AKLOGI("Do missing space correction");
606         }
607         // Next word
608         // Missing space
609         inputWordStartPos = i;
610         inputWordLength = inputSize - i;
611         if (getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
612                 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate,
613                 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
614                 false /* missing space */, freqArray, wordLengthArray, outputWord, 0)
615                         != FLAG_MULTIPLE_SUGGEST_CONTINUE) {
616             getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
617                     useFullEditDistance, inputSize, correction, queuePool,
618                     hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
619                     tempOutputWordLength, freqArray, wordLengthArray, outputWord);
620         }
621 
622         // Mistyped space
623         ++inputWordStartPos;
624         --inputWordLength;
625 
626         if (inputWordLength <= 0) {
627             continue;
628         }
629 
630         const int x = xcoordinates[inputWordStartPos - 1];
631         const int y = ycoordinates[inputWordStartPos - 1];
632         if (!proximityInfo->hasSpaceProximity(x, y)) {
633             continue;
634         }
635 
636         if (DEBUG_CORRECTION_FREQ) {
637             AKLOGI("Do mistyped space correction");
638         }
639         getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
640                 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate,
641                 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
642                 true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
643     }
644 }
645 
getSplitMultipleWordsSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool,const bool hasAutoCorrectionCandidate) const646 void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
647         const int *xcoordinates, const int *ycoordinates, const int *codes,
648         const bool useFullEditDistance, const int inputSize,
649         Correction *correction, WordsPriorityQueuePool *queuePool,
650         const bool hasAutoCorrectionCandidate) const {
651     if (inputSize >= MAX_WORD_LENGTH) return;
652     if (DEBUG_DICT) {
653         AKLOGI("--- Suggest multiple words");
654     }
655 
656     // Allocating fixed length array on stack
657     int outputWord[MAX_WORD_LENGTH];
658     int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
659     int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
660     const int outputWordLength = 0;
661     const int startInputPos = 0;
662     const int startWordIndex = 0;
663     getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
664             useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate,
665             startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray,
666             outputWord);
667 }
668 
669 // Wrapper for getMostProbableWordLikeInner, which matches it to the previous
670 // interface.
getMostProbableWordLike(const int startInputIndex,const int inputSize,Correction * correction,int * word) const671 int UnigramDictionary::getMostProbableWordLike(const int startInputIndex, const int inputSize,
672         Correction *correction, int *word) const {
673     int inWord[inputSize];
674     for (int i = 0; i < inputSize; ++i) {
675         inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i);
676     }
677     return getMostProbableWordLikeInner(inWord, inputSize, word);
678 }
679 
680 // This function will take the position of a character array within a CharGroup,
681 // and check it actually like-matches the word in inWord starting at startInputIndex,
682 // that is, it matches it with case and accents squashed.
683 // The function returns true if there was a full match, false otherwise.
684 // The function will copy on-the-fly the characters in the CharGroup to outNewWord.
685 // It will also place the end position of the array in outPos; in outInputIndex,
686 // it will place the index of the first char AFTER the match if there was a match,
687 // and the initial position if there was not. It makes sense because if there was
688 // a match we want to continue searching, but if there was not, we want to go to
689 // the next CharGroup.
690 // In and out parameters may point to the same location. This function takes care
691 // not to use any input parameters after it wrote into its outputs.
testCharGroupForContinuedLikeness(const uint8_t flags,const uint8_t * const root,const int startPos,const int * const inWord,const int startInputIndex,const int inputSize,int * outNewWord,int * outInputIndex,int * outPos)692 static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
693         const uint8_t *const root, const int startPos, const int *const inWord,
694         const int startInputIndex, const int inputSize, int *outNewWord, int *outInputIndex,
695         int *outPos) {
696     const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
697     int pos = startPos;
698     int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
699     int baseChar = toBaseLowerCase(codePoint);
700     const int wChar = toBaseLowerCase(inWord[startInputIndex]);
701 
702     if (baseChar != wChar) {
703         *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
704         *outInputIndex = startInputIndex;
705         return false;
706     }
707     int inputIndex = startInputIndex;
708     outNewWord[inputIndex] = codePoint;
709     if (hasMultipleChars) {
710         codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
711         while (NOT_A_CODE_POINT != codePoint) {
712             baseChar = toBaseLowerCase(codePoint);
713             if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
714                 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
715                 *outInputIndex = startInputIndex;
716                 return false;
717             }
718             outNewWord[inputIndex] = codePoint;
719             codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
720         }
721     }
722     *outInputIndex = inputIndex + 1;
723     *outPos = pos;
724     return true;
725 }
726 
727 // This function is invoked when a word like the word searched for is found.
728 // It will compare the probability to the max probability, and if greater, will
729 // copy the word into the output buffer. In output value maxFreq, it will
730 // write the new maximum probability if it changed.
onTerminalWordLike(const int freq,int * newWord,const int length,int * outWord,int * maxFreq)731 static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord,
732         int *maxFreq) {
733     if (freq > *maxFreq) {
734         for (int q = 0; q < length; ++q) {
735             outWord[q] = newWord[q];
736         }
737         outWord[length] = 0;
738         *maxFreq = freq;
739     }
740 }
741 
742 // Will find the highest probability of the words like the one passed as an argument,
743 // that is, everything that only differs by case/accents.
getMostProbableWordLikeInner(const int * const inWord,const int inputSize,int * outWord) const744 int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, const int inputSize,
745         int *outWord) const {
746     int newWord[MAX_WORD_LENGTH];
747     int depth = 0;
748     int maxFreq = -1;
749     const uint8_t *const root = DICT_ROOT;
750     int stackChildCount[MAX_WORD_LENGTH];
751     int stackInputIndex[MAX_WORD_LENGTH];
752     int stackSiblingPos[MAX_WORD_LENGTH];
753 
754     int startPos = 0;
755     stackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos);
756     stackInputIndex[0] = 0;
757     stackSiblingPos[0] = startPos;
758     while (depth >= 0) {
759         const int charGroupCount = stackChildCount[depth];
760         int pos = stackSiblingPos[depth];
761         for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
762             int inputIndex = stackInputIndex[depth];
763             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
764             // Test whether all chars in this group match with the word we are searching for. If so,
765             // we want to traverse its children (or if the inputSize match, evaluate its
766             // probability). Note that this function will output the position regardless, but will
767             // only write into inputIndex if there is a match.
768             const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
769                     inputIndex, inputSize, newWord, &inputIndex, &pos);
770             if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags))
771                     && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) {
772                 const int probability =
773                         BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
774                 onTerminalWordLike(probability, newWord, inputIndex, outWord, &maxFreq);
775             }
776             pos = BinaryFormat::skipProbability(flags, pos);
777             const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
778             const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
779             // If we had a match and the word has children, we want to traverse them. We don't have
780             // to traverse words longer than the one we are searching for, since they will not match
781             // anyway, so don't traverse unless inputIndex < inputSize.
782             if (isAlike && (-1 != childrenNodePos) && (inputIndex < inputSize)) {
783                 // Save position for this depth, to get back to this once children are done
784                 stackChildCount[depth] = charGroupIndex;
785                 stackSiblingPos[depth] = siblingPos;
786                 // Prepare stack values for next depth
787                 ++depth;
788                 int childrenPos = childrenNodePos;
789                 stackChildCount[depth] =
790                         BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
791                 stackSiblingPos[depth] = childrenPos;
792                 stackInputIndex[depth] = inputIndex;
793                 pos = childrenPos;
794                 // Go to the next depth level.
795                 ++depth;
796                 break;
797             } else {
798                 // No match, or no children, or word too long to ever match: go the next sibling.
799                 pos = siblingPos;
800             }
801         }
802         --depth;
803     }
804     return maxFreq;
805 }
806 
getProbability(const int * const inWord,const int length) const807 int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
808     const uint8_t *const root = DICT_ROOT;
809     int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
810             false /* forceLowerCaseSearch */);
811     if (NOT_VALID_WORD == pos) {
812         return NOT_A_PROBABILITY;
813     }
814     const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
815     if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
816         // If this is not a word, or if it's a blacklisted entry, it should behave as
817         // having no probability outside of the suggestion process (where it should be used
818         // for shortcuts).
819         return NOT_A_PROBABILITY;
820     }
821     const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
822     if (hasMultipleChars) {
823         pos = BinaryFormat::skipOtherCharacters(root, pos);
824     } else {
825         BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
826     }
827     const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
828     return unigramProbability;
829 }
830 
831 // TODO: remove this function.
getBigramPosition(int pos,int * word,int offset,int length) const832 int UnigramDictionary::getBigramPosition(int pos, int *word, int offset, int length) const {
833     return -1;
834 }
835 
836 // ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
837 // If the return value is false, then the caller should read in the output "nextSiblingPosition"
838 // to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
839 // It is worthy to note that when false is returned, the output values other than
840 // nextSiblingPosition are undefined.
841 // If the return value is true, then the caller must proceed to traverse the children of this
842 // node. processCurrentNode will output the information about the children: their count in
843 // newCount, their position in newChildrenPosition, the traverseAllNodes flag in
844 // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
845 // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
846 // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
847 // there aren't any more nodes at this level, it merely returns the address of the first byte after
848 // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
849 // given level, as output into newCount when traversing this level's parent.
processCurrentNode(const int initialPos,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,Correction * correction,int * newCount,int * newChildrenPosition,int * nextSiblingPosition,WordsPriorityQueuePool * queuePool,const int currentWordIndex) const850 bool UnigramDictionary::processCurrentNode(const int initialPos,
851         const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction,
852         int *newCount, int *newChildrenPosition, int *nextSiblingPosition,
853         WordsPriorityQueuePool *queuePool, const int currentWordIndex) const {
854     if (DEBUG_DICT) {
855         correction->checkState();
856     }
857     int pos = initialPos;
858 
859     // Flags contain the following information:
860     // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
861     //   - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
862     //     is on the specified number of bytes.
863     //   - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
864     // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
865     // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
866     // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
867     const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
868     const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
869     const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
870 
871     bool needsToInvokeOnTerminal = false;
872 
873     // This gets only ONE character from the stream. Next there will be:
874     // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
875     // else if FLAG_IS_TERMINAL: the probability
876     // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
877     // Note that you can't have a node that both is not a terminal and has no children.
878     int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
879     ASSERT(NOT_A_CODE_POINT != c);
880 
881     // We are going to loop through each character and make it look like it's a different
882     // node each time. To do that, we will process characters in this node in order until
883     // we find the character terminator. This is signalled by getCodePoint* returning
884     // NOT_A_CODE_POINT.
885     // As a special case, if there is only one character in this node, we must not read the
886     // next bytes so we will simulate the NOT_A_CODE_POINT return by testing the flags.
887     // This way, each loop run will look like a "virtual node".
888     do {
889         // We prefetch the next char. If 'c' is the last char of this node, we will have
890         // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node
891         // should behave as a terminal or not and whether we have children.
892         const int nextc = hasMultipleChars
893                 ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT;
894         const bool isLastChar = (NOT_A_CODE_POINT == nextc);
895         // If there are more chars in this nodes, then this virtual node is not a terminal.
896         // If we are on the last char, this virtual node is a terminal if this node is.
897         const bool isTerminal = isLastChar && isTerminalNode;
898 
899         Correction::CorrectionType stateType = correction->processCharAndCalcState(
900                 c, isTerminal);
901         if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
902                 || stateType == Correction::ON_TERMINAL) {
903             needsToInvokeOnTerminal = true;
904         } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) {
905             // We found that this is an unrelated character, so we should give up traversing
906             // this node and its children entirely.
907             // However we may not be on the last virtual node yet so we skip the remaining
908             // characters in this node, the probability if it's there, read the next sibling
909             // position to output it, then return false.
910             // We don't have to output other values because we return false, as in
911             // "don't traverse children".
912             if (!isLastChar) {
913                 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
914             }
915             pos = BinaryFormat::skipProbability(flags, pos);
916             *nextSiblingPosition =
917                     BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
918             return false;
919         }
920 
921         // Prepare for the next character. Promote the prefetched char to current char - the loop
922         // will take care of prefetching the next. If we finally found our last char, nextc will
923         // contain NOT_A_CODE_POINT.
924         c = nextc;
925     } while (NOT_A_CODE_POINT != c);
926 
927     if (isTerminalNode) {
928         // The probability should be here, because we come here only if this is actually
929         // a terminal node, and we are on its last char.
930         const int unigramProbability =
931                 BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos);
932         const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
933         const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
934         TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
935         // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
936         // bigramFilter is a bloom filter of said frequencies for even faster rejection.
937         const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
938                 unigramProbability);
939         onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
940                 currentWordIndex);
941 
942         // If there are more chars in this node, then this virtual node has children.
943         // If we are on the last char, this virtual node has children if this node has.
944         const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
945 
946         // This character matched the typed character (enough to traverse the node at least)
947         // so we just evaluated it. Now we should evaluate this virtual node's children - that
948         // is, if it has any. If it has no children, we're done here - so we skip the end of
949         // the node, output the siblings position, and return false "don't traverse children".
950         // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
951         // remaining char in this group for there can't be any.
952         if (!hasChildren) {
953             pos = BinaryFormat::skipProbability(flags, pos);
954             *nextSiblingPosition =
955                     BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
956             return false;
957         }
958 
959         // Optimization: Prune out words that are too long compared to how much was typed.
960         if (correction->needsToPrune()) {
961             pos = BinaryFormat::skipProbability(flags, pos);
962             *nextSiblingPosition =
963                     BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
964             if (DEBUG_DICT_FULL) {
965                 AKLOGI("Traversing was pruned.");
966             }
967             return false;
968         }
969     }
970 
971     // Now we finished processing this node, and we want to traverse children. If there are no
972     // children, we can't come here.
973     ASSERT(BinaryFormat::hasChildrenInFlags(flags));
974 
975     // If this node was a terminal it still has the probability under the pointer (it may have been
976     // read, but not skipped - see readProbabilityWithoutMovingPointer).
977     // Next come the children position, then possibly attributes (attributes are bigrams only for
978     // now, maybe something related to shortcuts in the future).
979     // Once this is read, we still need to output the number of nodes in the immediate children of
980     // this node, so we read and output it before returning true, as in "please traverse children".
981     pos = BinaryFormat::skipProbability(flags, pos);
982     int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
983     *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
984     *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
985     *newChildrenPosition = childrenPos;
986     return true;
987 }
988 } // namespace latinime
989