1 /*
2 * Copyright (C) 2010, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cstring>
18
19 #define LOG_TAG "LatinIME: unigram_dictionary.cpp"
20
21 #include "binary_format.h"
22 #include "char_utils.h"
23 #include "defines.h"
24 #include "dictionary.h"
25 #include "digraph_utils.h"
26 #include "proximity_info.h"
27 #include "terminal_attributes.h"
28 #include "unigram_dictionary.h"
29 #include "words_priority_queue.h"
30 #include "words_priority_queue_pool.h"
31
32 namespace latinime {
33
34 // TODO: check the header
UnigramDictionary(const uint8_t * const streamStart,const unsigned int dictFlags)35 UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags)
36 : DICT_ROOT(streamStart), ROOT_POS(0),
37 MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) {
38 if (DEBUG_DICT) {
39 AKLOGI("UnigramDictionary - constructor");
40 }
41 }
42
~UnigramDictionary()43 UnigramDictionary::~UnigramDictionary() {
44 }
45
46 // TODO: This needs to take a const int* and not tinker with its contents
addWord(int * word,int length,int probability,WordsPriorityQueue * queue,int type)47 static void addWord(int *word, int length, int probability, WordsPriorityQueue *queue, int type) {
48 queue->push(probability, word, length, type);
49 }
50
51 // Return the replacement code point for a digraph, or 0 if none.
getDigraphReplacement(const int * codes,const int i,const int inputSize,const DigraphUtils::digraph_t * const digraphs,const unsigned int digraphsSize) const52 int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int inputSize,
53 const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
54
55 // There can't be a digraph if we don't have at least 2 characters to examine
56 if (i + 2 > inputSize) return false;
57
58 // Search for the first char of some digraph
59 int lastDigraphIndex = -1;
60 const int thisChar = codes[i];
61 for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) {
62 if (thisChar == digraphs[lastDigraphIndex].first) break;
63 }
64 // No match: return early
65 if (lastDigraphIndex < 0) return 0;
66
67 // It's an interesting digraph if the second char matches too.
68 if (digraphs[lastDigraphIndex].second == codes[i + 1]) {
69 return digraphs[lastDigraphIndex].compositeGlyph;
70 } else {
71 return 0;
72 }
73 }
74
75 // Mostly the same arguments as the non-recursive version, except:
76 // codes is the original value. It points to the start of the work buffer, and gets passed as is.
77 // inputSize is the size of the user input (thus, it is the size of codesSrc).
78 // codesDest is the current point in the work buffer.
79 // codesSrc is the current point in the user-input, original, content-unmodified buffer.
80 // codesRemain is the remaining size in codesSrc.
getWordWithDigraphSuggestionsRec(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codesBuffer,int * xCoordinatesBuffer,int * yCoordinatesBuffer,const int codesBufferSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,const int * codesSrc,const int codesRemain,const int currentDepth,int * codesDest,Correction * correction,WordsPriorityQueuePool * queuePool,const DigraphUtils::digraph_t * const digraphs,const unsigned int digraphsSize) const81 void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
82 const int *xcoordinates, const int *ycoordinates, const int *codesBuffer,
83 int *xCoordinatesBuffer, int *yCoordinatesBuffer,
84 const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
85 const bool useFullEditDistance, const int *codesSrc,
86 const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
87 WordsPriorityQueuePool *queuePool,
88 const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
89 ASSERT(sizeof(codesDest[0]) == sizeof(codesSrc[0]));
90 ASSERT(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0]));
91 ASSERT(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0]));
92
93 const int startIndex = static_cast<int>(codesDest - codesBuffer);
94 if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) {
95 for (int i = 0; i < codesRemain; ++i) {
96 xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i];
97 yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i];
98 const int replacementCodePoint =
99 getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize);
100 if (0 != replacementCodePoint) {
101 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
102
103 // Copy the word up to the first char of the digraph, including proximity chars,
104 // and overwrite the primary code with the replacement code point. Then, continue
105 // processing on the remaining part of the word, skipping the second char of the
106 // digraph.
107 // In our example, copy "pru", replace "u" with the version with the diaeresis and
108 // continue running on "fen".
109 // Make i the index of the second char of the digraph for simplicity. Forgetting
110 // to do that results in an infinite recursion so take care!
111 ++i;
112 memcpy(codesDest, codesSrc, i * sizeof(codesDest[0]));
113 codesDest[i - 1] = replacementCodePoint;
114 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
115 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
116 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1,
117 codesRemain - i - 1, currentDepth + 1, codesDest + i, correction,
118 queuePool, digraphs, digraphsSize);
119
120 // Copy the second char of the digraph in place, then continue processing on
121 // the remaining part of the word.
122 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
123 memcpy(codesDest + i, codesSrc + i, sizeof(codesDest[0]));
124 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
125 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
126 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i,
127 currentDepth + 1, codesDest + i, correction, queuePool, digraphs,
128 digraphsSize);
129 return;
130 }
131 }
132 }
133
134 // If we come here, we hit the end of the word: let's check it against the dictionary.
135 // In our example, we'll come here once for "prufen" and then once for "pruefen".
136 // If the word contains several digraphs, we'll come it for the product of them.
137 // eg. if the word is "ueberpruefen" we'll test, in order, against
138 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
139 const unsigned int remainingBytes = sizeof(codesDest[0]) * codesRemain;
140 if (0 != remainingBytes) {
141 memcpy(codesDest, codesSrc, remainingBytes);
142 memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain],
143 sizeof(xCoordinatesBuffer[0]) * codesRemain);
144 memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain],
145 sizeof(yCoordinatesBuffer[0]) * codesRemain);
146 }
147
148 getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer,
149 startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction,
150 queuePool);
151 }
152
153 // bigramMap contains the association <bigram address> -> <bigram probability>
154 // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
155 // in bigram_dictionary.cpp
getSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * inputCodePoints,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,int * outWords,int * frequencies,int * outputTypes) const156 int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
157 const int *ycoordinates, const int *inputCodePoints, const int inputSize,
158 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
159 const bool useFullEditDistance, int *outWords, int *frequencies, int *outputTypes) const {
160 WordsPriorityQueuePool queuePool(MAX_RESULTS, SUB_QUEUE_MAX_WORDS);
161 queuePool.clearAll();
162 Correction masterCorrection;
163 masterCorrection.resetCorrection();
164 const DigraphUtils::digraph_t *digraphs = 0;
165 const int digraphsSize =
166 DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(DICT_FLAGS, &digraphs);
167 if (digraphsSize > 0)
168 { // Incrementally tune the word and try all possibilities
169 int codesBuffer[sizeof(*inputCodePoints) * inputSize];
170 int xCoordinatesBuffer[inputSize];
171 int yCoordinatesBuffer[inputSize];
172 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
173 xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter,
174 useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection,
175 &queuePool, digraphs, digraphsSize);
176 } else { // Normal processing
177 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize,
178 bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool);
179 }
180
181 PROF_START(20);
182 if (DEBUG_DICT) {
183 float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
184 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0);
185 ns += 0;
186 AKLOGI("Max normalized score = %f", ns);
187 }
188 const int suggestedWordsCount =
189 queuePool.getMasterQueue()->outputSuggestions(masterCorrection.getPrimaryInputWord(),
190 inputSize, frequencies, outWords, outputTypes);
191
192 if (DEBUG_DICT) {
193 float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
194 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0);
195 ns += 0;
196 AKLOGI("Returning %d words", suggestedWordsCount);
197 /// Print the returned words
198 for (int j = 0; j < suggestedWordsCount; ++j) {
199 int *w = outWords + j * MAX_WORD_LENGTH;
200 char s[MAX_WORD_LENGTH];
201 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
202 (void)s; // To suppress compiler warning
203 AKLOGI("%s %i", s, frequencies[j]);
204 }
205 }
206 PROF_END(20);
207 PROF_CLOSE;
208 return suggestedWordsCount;
209 }
210
getWordSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * inputCodePoints,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,Correction * correction,WordsPriorityQueuePool * queuePool) const211 void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
212 const int *ycoordinates, const int *inputCodePoints, const int inputSize,
213 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
214 const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool)
215 const {
216 PROF_OPEN;
217 PROF_START(0);
218 PROF_END(0);
219
220 PROF_START(1);
221 getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, bigramMap,
222 bigramFilter, useFullEditDistance, inputSize, correction, queuePool);
223 PROF_END(1);
224
225 PROF_START(2);
226 // Note: This line is intentionally left blank
227 PROF_END(2);
228
229 PROF_START(3);
230 // Note: This line is intentionally left blank
231 PROF_END(3);
232
233 PROF_START(4);
234 bool hasAutoCorrectionCandidate = false;
235 WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
236 if (masterQueue->size() > 0) {
237 float nsForMaster = masterQueue->getHighestNormalizedScore(
238 correction->getPrimaryInputWord(), inputSize, 0, 0, 0);
239 hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD);
240 }
241 PROF_END(4);
242
243 PROF_START(5);
244 // Multiple word suggestions
245 if (SUGGEST_MULTIPLE_WORDS
246 && inputSize >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
247 getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints,
248 useFullEditDistance, inputSize, correction, queuePool,
249 hasAutoCorrectionCandidate);
250 }
251 PROF_END(5);
252
253 PROF_START(6);
254 // Note: This line is intentionally left blank
255 PROF_END(6);
256
257 if (DEBUG_DICT) {
258 queuePool->dumpSubQueue1TopSuggestions();
259 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
260 WordsPriorityQueue *queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i);
261 if (queue->size() > 0) {
262 WordsPriorityQueue::SuggestedWord *sw = queue->top();
263 const int score = sw->mScore;
264 const int *word = sw->mWord;
265 const int wordLength = sw->mWordLength;
266 float ns = Correction::RankingAlgorithm::calcNormalizedScore(
267 correction->getPrimaryInputWord(), i, word, wordLength, score);
268 ns += 0;
269 AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
270 (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD));
271 DUMP_WORD(correction->getPrimaryInputWord(), i);
272 DUMP_WORD(word, wordLength);
273 }
274 }
275 }
276 }
277
initSuggestions(ProximityInfo * proximityInfo,const int * xCoordinates,const int * yCoordinates,const int * codes,const int inputSize,Correction * correction) const278 void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates,
279 const int *yCoordinates, const int *codes, const int inputSize,
280 Correction *correction) const {
281 if (DEBUG_DICT) {
282 AKLOGI("initSuggest");
283 DUMP_WORD(codes, inputSize);
284 }
285 correction->initInputParams(proximityInfo, codes, inputSize, xCoordinates, yCoordinates);
286 const int maxDepth = min(inputSize * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
287 correction->initCorrection(proximityInfo, inputSize, maxDepth);
288 }
289
getOneWordSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool) const290 void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
291 const int *xcoordinates, const int *ycoordinates, const int *codes,
292 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
293 const bool useFullEditDistance, const int inputSize,
294 Correction *correction, WordsPriorityQueuePool *queuePool) const {
295 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputSize, correction);
296 getSuggestionCandidates(useFullEditDistance, inputSize, bigramMap, bigramFilter, correction,
297 queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
298 }
299
getSuggestionCandidates(const bool useFullEditDistance,const int inputSize,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,Correction * correction,WordsPriorityQueuePool * queuePool,const bool doAutoCompletion,const int maxErrors,const int currentWordIndex) const300 void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
301 const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
302 Correction *correction, WordsPriorityQueuePool *queuePool,
303 const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) const {
304 uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount();
305 if (DEBUG_DICT) {
306 AKLOGI("Traverse count %d", totalTraverseCount);
307 }
308 if (totalTraverseCount > MULTIPLE_WORDS_SUGGESTION_MAX_TOTAL_TRAVERSE_COUNT) {
309 if (DEBUG_DICT) {
310 AKLOGI("Abort traversing %d", totalTraverseCount);
311 }
312 return;
313 }
314 // TODO: Remove setCorrectionParams
315 correction->setCorrectionParams(0, 0, 0,
316 -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
317 doAutoCompletion, maxErrors);
318 int rootPosition = ROOT_POS;
319 // Get the number of children of root, then increment the position
320 int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition);
321 int outputIndex = 0;
322
323 correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0));
324
325 // Depth first search
326 while (outputIndex >= 0) {
327 if (correction->initProcessState(outputIndex)) {
328 int siblingPos = correction->getTreeSiblingPos(outputIndex);
329 int firstChildPos;
330
331 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
332 bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos,
333 queuePool, currentWordIndex);
334 // Update next sibling pos
335 correction->setTreeSiblingPos(outputIndex, siblingPos);
336
337 if (needsToTraverseChildrenNodes) {
338 // Goes to child node
339 outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos);
340 }
341 } else {
342 // Goes to parent sibling node
343 outputIndex = correction->getTreeParentIndex(outputIndex);
344 }
345 }
346 }
347
onTerminal(const int probability,const TerminalAttributes & terminalAttributes,Correction * correction,WordsPriorityQueuePool * queuePool,const bool addToMasterQueue,const int currentWordIndex) const348 void UnigramDictionary::onTerminal(const int probability,
349 const TerminalAttributes &terminalAttributes, Correction *correction,
350 WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
351 const int currentWordIndex) const {
352 const int inputIndex = correction->getInputIndex();
353 const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
354
355 int wordLength;
356 int *wordPointer;
357
358 if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) {
359 WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
360 const int finalProbability =
361 correction->getFinalProbability(probability, &wordPointer, &wordLength);
362
363 if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) {
364 // If the probability is 0, we don't want to add this word. However we still
365 // want to add its shortcuts (including a possible whitelist entry) if any.
366 // Furthermore, if this is not a word (shortcut only for example) or a blacklisted
367 // entry then we never want to suggest this.
368 addWord(wordPointer, wordLength, finalProbability, masterQueue,
369 Dictionary::KIND_CORRECTION);
370 }
371
372 const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0;
373 // Please note that the shortcut candidates will be added to the master queue only.
374 TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator();
375 while (iterator.hasNextShortcutTarget()) {
376 // TODO: addWord only supports weak ordering, meaning we have no means
377 // to control the order of the shortcuts relative to one another or to the word.
378 // We need to either modulate the probability of each shortcut according
379 // to its own shortcut probability or to make the queue
380 // so that the insert order is protected inside the queue for words
381 // with the same score. For the moment we use -1 to make sure the shortcut will
382 // never be in front of the word.
383 int shortcutTarget[MAX_WORD_LENGTH];
384 int shortcutFrequency;
385 const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
386 MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency);
387 int shortcutScore;
388 int kind;
389 if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_PROBABILITY
390 && correction->sameAsTyped()) {
391 shortcutScore = S_INT_MAX;
392 kind = Dictionary::KIND_WHITELIST;
393 } else {
394 shortcutScore = shortcutProbability;
395 kind = Dictionary::KIND_CORRECTION;
396 }
397 addWord(shortcutTarget, shortcutTargetStringLength, shortcutScore,
398 masterQueue, kind);
399 }
400 }
401
402 // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
403 // or more length.
404 if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
405 WordsPriorityQueue *subQueue;
406 subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex);
407 if (!subQueue) {
408 return;
409 }
410 const int finalProbability = correction->getFinalProbabilityForSubQueue(
411 probability, &wordPointer, &wordLength, inputIndex);
412 addWord(wordPointer, wordLength, finalProbability, subQueue, Dictionary::KIND_CORRECTION);
413 }
414 }
415
getSubStringSuggestion(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,Correction * correction,WordsPriorityQueuePool * queuePool,const int inputSize,const bool hasAutoCorrectionCandidate,const int currentWordIndex,const int inputWordStartPos,const int inputWordLength,const int outputWordStartPos,const bool isSpaceProximity,int * freqArray,int * wordLengthArray,int * outputWord,int * outputWordLength) const416 int UnigramDictionary::getSubStringSuggestion(
417 ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
418 const int *codes, const bool useFullEditDistance, Correction *correction,
419 WordsPriorityQueuePool *queuePool, const int inputSize,
420 const bool hasAutoCorrectionCandidate, const int currentWordIndex,
421 const int inputWordStartPos, const int inputWordLength,
422 const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
423 int *wordLengthArray, int *outputWord, int *outputWordLength) const {
424 if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) {
425 return FLAG_MULTIPLE_SUGGEST_ABORT;
426 }
427
428 /////////////////////////////////////////////
429 // safety net for multiple word suggestion //
430 // TODO: Remove this safety net //
431 /////////////////////////////////////////////
432 int smallWordCount = 0;
433 int singleLetterWordCount = 0;
434 if (inputWordLength == 1) {
435 ++singleLetterWordCount;
436 }
437 if (inputWordLength <= 2) {
438 // small word == single letter or 2-letter word
439 ++smallWordCount;
440 }
441 for (int i = 0; i < currentWordIndex; ++i) {
442 const int length = wordLengthArray[i];
443 if (length == 1) {
444 ++singleLetterWordCount;
445 // Safety net to avoid suggesting sequential single letter words
446 if (i < (currentWordIndex - 1)) {
447 if (wordLengthArray[i + 1] == 1) {
448 return FLAG_MULTIPLE_SUGGEST_ABORT;
449 }
450 } else if (inputWordLength == 1) {
451 return FLAG_MULTIPLE_SUGGEST_ABORT;
452 }
453 }
454 if (length <= 2) {
455 ++smallWordCount;
456 }
457 // Safety net to avoid suggesting multiple words with many (4 or more, for now) small words
458 if (singleLetterWordCount >= 3 || smallWordCount >= 4) {
459 return FLAG_MULTIPLE_SUGGEST_ABORT;
460 }
461 }
462 //////////////////////////////////////////////
463 // TODO: Remove the safety net above //
464 //////////////////////////////////////////////
465
466 int *tempOutputWord = 0;
467 int nextWordLength = 0;
468 // TODO: Optimize init suggestion
469 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
470 inputSize, correction);
471
472 int word[MAX_WORD_LENGTH];
473 int freq = getMostProbableWordLike(
474 inputWordStartPos, inputWordLength, correction, word);
475 if (freq > 0) {
476 nextWordLength = inputWordLength;
477 tempOutputWord = word;
478 } else if (!hasAutoCorrectionCandidate) {
479 if (inputWordStartPos > 0) {
480 const int offset = inputWordStartPos;
481 initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
482 codes + offset, inputWordLength, correction);
483 queuePool->clearSubQueue(currentWordIndex);
484 // TODO: pass the bigram list for substring suggestion
485 getSuggestionCandidates(useFullEditDistance, inputWordLength,
486 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool,
487 false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
488 if (DEBUG_DICT) {
489 if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
490 AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
491 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
492 queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
493 }
494 }
495 }
496 }
497 WordsPriorityQueue *queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
498 // TODO: Return the correct value depending on doAutoCompletion
499 if (!queue || queue->size() <= 0) {
500 return FLAG_MULTIPLE_SUGGEST_ABORT;
501 }
502 int score = 0;
503 const float ns = queue->getHighestNormalizedScore(
504 correction->getPrimaryInputWord(), inputWordLength,
505 &tempOutputWord, &score, &nextWordLength);
506 if (DEBUG_DICT) {
507 AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
508 }
509 // Two words correction won't be done if the score of the first word doesn't exceed the
510 // threshold.
511 if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
512 || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
513 return FLAG_MULTIPLE_SUGGEST_SKIP;
514 }
515 freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
516 }
517 if (DEBUG_DICT) {
518 AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)",
519 currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos,
520 (currentWordIndex > 0) ? wordLengthArray[0] : 0);
521 }
522 if (freq <= 0 || nextWordLength <= 0
523 || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) {
524 return FLAG_MULTIPLE_SUGGEST_SKIP;
525 }
526 for (int i = 0; i < nextWordLength; ++i) {
527 outputWord[outputWordStartPos + i] = tempOutputWord[i];
528 }
529
530 // Put output values
531 freqArray[currentWordIndex] = freq;
532 // TODO: put output length instead of input length
533 wordLengthArray[currentWordIndex] = inputWordLength;
534 const int tempOutputWordLength = outputWordStartPos + nextWordLength;
535 if (outputWordLength) {
536 *outputWordLength = tempOutputWordLength;
537 }
538
539 if ((inputWordStartPos + inputWordLength) < inputSize) {
540 if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
541 return FLAG_MULTIPLE_SUGGEST_SKIP;
542 }
543 outputWord[tempOutputWordLength] = KEYCODE_SPACE;
544 if (outputWordLength) {
545 ++*outputWordLength;
546 }
547 } else if (currentWordIndex >= 1) {
548 // TODO: Handle 3 or more words
549 const int pairFreq = correction->getFreqForSplitMultipleWords(
550 freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
551 if (DEBUG_DICT) {
552 DUMP_WORD(outputWord, tempOutputWordLength);
553 for (int i = 0; i < currentWordIndex + 1; ++i) {
554 AKLOGI("Split %d,%d words: freq = %d, length = %d", i, currentWordIndex + 1,
555 freqArray[i], wordLengthArray[i]);
556 }
557 AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq,
558 inputSize, tempOutputWordLength, isSpaceProximity);
559 }
560 addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue(),
561 Dictionary::KIND_CORRECTION);
562 }
563 return FLAG_MULTIPLE_SUGGEST_CONTINUE;
564 }
565
getMultiWordsSuggestionRec(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool,const bool hasAutoCorrectionCandidate,const int startInputPos,const int startWordIndex,const int outputWordLength,int * freqArray,int * wordLengthArray,int * outputWord) const566 void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
567 const int *xcoordinates, const int *ycoordinates, const int *codes,
568 const bool useFullEditDistance, const int inputSize, Correction *correction,
569 WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate,
570 const int startInputPos, const int startWordIndex, const int outputWordLength,
571 int *freqArray, int *wordLengthArray, int *outputWord) const {
572 if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) {
573 // Return if the last word index
574 return;
575 }
576 if (startWordIndex >= 1
577 && (hasAutoCorrectionCandidate
578 || inputSize < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
579 // Do not suggest 3+ words if already has auto correction candidate
580 return;
581 }
582 for (int i = startInputPos + 1; i < inputSize; ++i) {
583 if (DEBUG_CORRECTION_FREQ) {
584 AKLOGI("Multi words(%d), start in %d sep %d start out %d",
585 startWordIndex, startInputPos, i, outputWordLength);
586 DUMP_WORD(outputWord, outputWordLength);
587 }
588 int tempOutputWordLength = 0;
589 // Current word
590 int inputWordStartPos = startInputPos;
591 int inputWordLength = i - startInputPos;
592 const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates,
593 codes, useFullEditDistance, correction, queuePool, inputSize,
594 hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength,
595 outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord,
596 &tempOutputWordLength);
597 if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_ABORT) {
598 // TODO: break here
599 continue;
600 } else if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_SKIP) {
601 continue;
602 }
603
604 if (DEBUG_CORRECTION_FREQ) {
605 AKLOGI("Do missing space correction");
606 }
607 // Next word
608 // Missing space
609 inputWordStartPos = i;
610 inputWordLength = inputSize - i;
611 if (getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
612 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate,
613 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
614 false /* missing space */, freqArray, wordLengthArray, outputWord, 0)
615 != FLAG_MULTIPLE_SUGGEST_CONTINUE) {
616 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
617 useFullEditDistance, inputSize, correction, queuePool,
618 hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
619 tempOutputWordLength, freqArray, wordLengthArray, outputWord);
620 }
621
622 // Mistyped space
623 ++inputWordStartPos;
624 --inputWordLength;
625
626 if (inputWordLength <= 0) {
627 continue;
628 }
629
630 const int x = xcoordinates[inputWordStartPos - 1];
631 const int y = ycoordinates[inputWordStartPos - 1];
632 if (!proximityInfo->hasSpaceProximity(x, y)) {
633 continue;
634 }
635
636 if (DEBUG_CORRECTION_FREQ) {
637 AKLOGI("Do mistyped space correction");
638 }
639 getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
640 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate,
641 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
642 true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
643 }
644 }
645
getSplitMultipleWordsSuggestions(ProximityInfo * proximityInfo,const int * xcoordinates,const int * ycoordinates,const int * codes,const bool useFullEditDistance,const int inputSize,Correction * correction,WordsPriorityQueuePool * queuePool,const bool hasAutoCorrectionCandidate) const646 void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
647 const int *xcoordinates, const int *ycoordinates, const int *codes,
648 const bool useFullEditDistance, const int inputSize,
649 Correction *correction, WordsPriorityQueuePool *queuePool,
650 const bool hasAutoCorrectionCandidate) const {
651 if (inputSize >= MAX_WORD_LENGTH) return;
652 if (DEBUG_DICT) {
653 AKLOGI("--- Suggest multiple words");
654 }
655
656 // Allocating fixed length array on stack
657 int outputWord[MAX_WORD_LENGTH];
658 int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
659 int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
660 const int outputWordLength = 0;
661 const int startInputPos = 0;
662 const int startWordIndex = 0;
663 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
664 useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate,
665 startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray,
666 outputWord);
667 }
668
669 // Wrapper for getMostProbableWordLikeInner, which matches it to the previous
670 // interface.
getMostProbableWordLike(const int startInputIndex,const int inputSize,Correction * correction,int * word) const671 int UnigramDictionary::getMostProbableWordLike(const int startInputIndex, const int inputSize,
672 Correction *correction, int *word) const {
673 int inWord[inputSize];
674 for (int i = 0; i < inputSize; ++i) {
675 inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i);
676 }
677 return getMostProbableWordLikeInner(inWord, inputSize, word);
678 }
679
680 // This function will take the position of a character array within a CharGroup,
681 // and check it actually like-matches the word in inWord starting at startInputIndex,
682 // that is, it matches it with case and accents squashed.
683 // The function returns true if there was a full match, false otherwise.
684 // The function will copy on-the-fly the characters in the CharGroup to outNewWord.
685 // It will also place the end position of the array in outPos; in outInputIndex,
686 // it will place the index of the first char AFTER the match if there was a match,
687 // and the initial position if there was not. It makes sense because if there was
688 // a match we want to continue searching, but if there was not, we want to go to
689 // the next CharGroup.
690 // In and out parameters may point to the same location. This function takes care
691 // not to use any input parameters after it wrote into its outputs.
testCharGroupForContinuedLikeness(const uint8_t flags,const uint8_t * const root,const int startPos,const int * const inWord,const int startInputIndex,const int inputSize,int * outNewWord,int * outInputIndex,int * outPos)692 static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
693 const uint8_t *const root, const int startPos, const int *const inWord,
694 const int startInputIndex, const int inputSize, int *outNewWord, int *outInputIndex,
695 int *outPos) {
696 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
697 int pos = startPos;
698 int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
699 int baseChar = toBaseLowerCase(codePoint);
700 const int wChar = toBaseLowerCase(inWord[startInputIndex]);
701
702 if (baseChar != wChar) {
703 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
704 *outInputIndex = startInputIndex;
705 return false;
706 }
707 int inputIndex = startInputIndex;
708 outNewWord[inputIndex] = codePoint;
709 if (hasMultipleChars) {
710 codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
711 while (NOT_A_CODE_POINT != codePoint) {
712 baseChar = toBaseLowerCase(codePoint);
713 if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
714 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
715 *outInputIndex = startInputIndex;
716 return false;
717 }
718 outNewWord[inputIndex] = codePoint;
719 codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
720 }
721 }
722 *outInputIndex = inputIndex + 1;
723 *outPos = pos;
724 return true;
725 }
726
727 // This function is invoked when a word like the word searched for is found.
728 // It will compare the probability to the max probability, and if greater, will
729 // copy the word into the output buffer. In output value maxFreq, it will
730 // write the new maximum probability if it changed.
onTerminalWordLike(const int freq,int * newWord,const int length,int * outWord,int * maxFreq)731 static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord,
732 int *maxFreq) {
733 if (freq > *maxFreq) {
734 for (int q = 0; q < length; ++q) {
735 outWord[q] = newWord[q];
736 }
737 outWord[length] = 0;
738 *maxFreq = freq;
739 }
740 }
741
742 // Will find the highest probability of the words like the one passed as an argument,
743 // that is, everything that only differs by case/accents.
getMostProbableWordLikeInner(const int * const inWord,const int inputSize,int * outWord) const744 int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, const int inputSize,
745 int *outWord) const {
746 int newWord[MAX_WORD_LENGTH];
747 int depth = 0;
748 int maxFreq = -1;
749 const uint8_t *const root = DICT_ROOT;
750 int stackChildCount[MAX_WORD_LENGTH];
751 int stackInputIndex[MAX_WORD_LENGTH];
752 int stackSiblingPos[MAX_WORD_LENGTH];
753
754 int startPos = 0;
755 stackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos);
756 stackInputIndex[0] = 0;
757 stackSiblingPos[0] = startPos;
758 while (depth >= 0) {
759 const int charGroupCount = stackChildCount[depth];
760 int pos = stackSiblingPos[depth];
761 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
762 int inputIndex = stackInputIndex[depth];
763 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
764 // Test whether all chars in this group match with the word we are searching for. If so,
765 // we want to traverse its children (or if the inputSize match, evaluate its
766 // probability). Note that this function will output the position regardless, but will
767 // only write into inputIndex if there is a match.
768 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
769 inputIndex, inputSize, newWord, &inputIndex, &pos);
770 if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags))
771 && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) {
772 const int probability =
773 BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
774 onTerminalWordLike(probability, newWord, inputIndex, outWord, &maxFreq);
775 }
776 pos = BinaryFormat::skipProbability(flags, pos);
777 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
778 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
779 // If we had a match and the word has children, we want to traverse them. We don't have
780 // to traverse words longer than the one we are searching for, since they will not match
781 // anyway, so don't traverse unless inputIndex < inputSize.
782 if (isAlike && (-1 != childrenNodePos) && (inputIndex < inputSize)) {
783 // Save position for this depth, to get back to this once children are done
784 stackChildCount[depth] = charGroupIndex;
785 stackSiblingPos[depth] = siblingPos;
786 // Prepare stack values for next depth
787 ++depth;
788 int childrenPos = childrenNodePos;
789 stackChildCount[depth] =
790 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
791 stackSiblingPos[depth] = childrenPos;
792 stackInputIndex[depth] = inputIndex;
793 pos = childrenPos;
794 // Go to the next depth level.
795 ++depth;
796 break;
797 } else {
798 // No match, or no children, or word too long to ever match: go the next sibling.
799 pos = siblingPos;
800 }
801 }
802 --depth;
803 }
804 return maxFreq;
805 }
806
getProbability(const int * const inWord,const int length) const807 int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
808 const uint8_t *const root = DICT_ROOT;
809 int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
810 false /* forceLowerCaseSearch */);
811 if (NOT_VALID_WORD == pos) {
812 return NOT_A_PROBABILITY;
813 }
814 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
815 if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
816 // If this is not a word, or if it's a blacklisted entry, it should behave as
817 // having no probability outside of the suggestion process (where it should be used
818 // for shortcuts).
819 return NOT_A_PROBABILITY;
820 }
821 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
822 if (hasMultipleChars) {
823 pos = BinaryFormat::skipOtherCharacters(root, pos);
824 } else {
825 BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
826 }
827 const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
828 return unigramProbability;
829 }
830
831 // TODO: remove this function.
getBigramPosition(int pos,int * word,int offset,int length) const832 int UnigramDictionary::getBigramPosition(int pos, int *word, int offset, int length) const {
833 return -1;
834 }
835
836 // ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
837 // If the return value is false, then the caller should read in the output "nextSiblingPosition"
838 // to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
839 // It is worthy to note that when false is returned, the output values other than
840 // nextSiblingPosition are undefined.
841 // If the return value is true, then the caller must proceed to traverse the children of this
842 // node. processCurrentNode will output the information about the children: their count in
843 // newCount, their position in newChildrenPosition, the traverseAllNodes flag in
844 // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
845 // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
846 // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
847 // there aren't any more nodes at this level, it merely returns the address of the first byte after
848 // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
849 // given level, as output into newCount when traversing this level's parent.
processCurrentNode(const int initialPos,const std::map<int,int> * bigramMap,const uint8_t * bigramFilter,Correction * correction,int * newCount,int * newChildrenPosition,int * nextSiblingPosition,WordsPriorityQueuePool * queuePool,const int currentWordIndex) const850 bool UnigramDictionary::processCurrentNode(const int initialPos,
851 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction,
852 int *newCount, int *newChildrenPosition, int *nextSiblingPosition,
853 WordsPriorityQueuePool *queuePool, const int currentWordIndex) const {
854 if (DEBUG_DICT) {
855 correction->checkState();
856 }
857 int pos = initialPos;
858
859 // Flags contain the following information:
860 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
861 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
862 // is on the specified number of bytes.
863 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
864 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
865 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
866 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
867 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
868 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
869 const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
870
871 bool needsToInvokeOnTerminal = false;
872
873 // This gets only ONE character from the stream. Next there will be:
874 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
875 // else if FLAG_IS_TERMINAL: the probability
876 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
877 // Note that you can't have a node that both is not a terminal and has no children.
878 int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
879 ASSERT(NOT_A_CODE_POINT != c);
880
881 // We are going to loop through each character and make it look like it's a different
882 // node each time. To do that, we will process characters in this node in order until
883 // we find the character terminator. This is signalled by getCodePoint* returning
884 // NOT_A_CODE_POINT.
885 // As a special case, if there is only one character in this node, we must not read the
886 // next bytes so we will simulate the NOT_A_CODE_POINT return by testing the flags.
887 // This way, each loop run will look like a "virtual node".
888 do {
889 // We prefetch the next char. If 'c' is the last char of this node, we will have
890 // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node
891 // should behave as a terminal or not and whether we have children.
892 const int nextc = hasMultipleChars
893 ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT;
894 const bool isLastChar = (NOT_A_CODE_POINT == nextc);
895 // If there are more chars in this nodes, then this virtual node is not a terminal.
896 // If we are on the last char, this virtual node is a terminal if this node is.
897 const bool isTerminal = isLastChar && isTerminalNode;
898
899 Correction::CorrectionType stateType = correction->processCharAndCalcState(
900 c, isTerminal);
901 if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
902 || stateType == Correction::ON_TERMINAL) {
903 needsToInvokeOnTerminal = true;
904 } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) {
905 // We found that this is an unrelated character, so we should give up traversing
906 // this node and its children entirely.
907 // However we may not be on the last virtual node yet so we skip the remaining
908 // characters in this node, the probability if it's there, read the next sibling
909 // position to output it, then return false.
910 // We don't have to output other values because we return false, as in
911 // "don't traverse children".
912 if (!isLastChar) {
913 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
914 }
915 pos = BinaryFormat::skipProbability(flags, pos);
916 *nextSiblingPosition =
917 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
918 return false;
919 }
920
921 // Prepare for the next character. Promote the prefetched char to current char - the loop
922 // will take care of prefetching the next. If we finally found our last char, nextc will
923 // contain NOT_A_CODE_POINT.
924 c = nextc;
925 } while (NOT_A_CODE_POINT != c);
926
927 if (isTerminalNode) {
928 // The probability should be here, because we come here only if this is actually
929 // a terminal node, and we are on its last char.
930 const int unigramProbability =
931 BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos);
932 const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
933 const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
934 TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
935 // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
936 // bigramFilter is a bloom filter of said frequencies for even faster rejection.
937 const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
938 unigramProbability);
939 onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
940 currentWordIndex);
941
942 // If there are more chars in this node, then this virtual node has children.
943 // If we are on the last char, this virtual node has children if this node has.
944 const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
945
946 // This character matched the typed character (enough to traverse the node at least)
947 // so we just evaluated it. Now we should evaluate this virtual node's children - that
948 // is, if it has any. If it has no children, we're done here - so we skip the end of
949 // the node, output the siblings position, and return false "don't traverse children".
950 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
951 // remaining char in this group for there can't be any.
952 if (!hasChildren) {
953 pos = BinaryFormat::skipProbability(flags, pos);
954 *nextSiblingPosition =
955 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
956 return false;
957 }
958
959 // Optimization: Prune out words that are too long compared to how much was typed.
960 if (correction->needsToPrune()) {
961 pos = BinaryFormat::skipProbability(flags, pos);
962 *nextSiblingPosition =
963 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
964 if (DEBUG_DICT_FULL) {
965 AKLOGI("Traversing was pruned.");
966 }
967 return false;
968 }
969 }
970
971 // Now we finished processing this node, and we want to traverse children. If there are no
972 // children, we can't come here.
973 ASSERT(BinaryFormat::hasChildrenInFlags(flags));
974
975 // If this node was a terminal it still has the probability under the pointer (it may have been
976 // read, but not skipped - see readProbabilityWithoutMovingPointer).
977 // Next come the children position, then possibly attributes (attributes are bigrams only for
978 // now, maybe something related to shortcuts in the future).
979 // Once this is read, we still need to output the number of nodes in the immediate children of
980 // this node, so we read and output it before returning true, as in "please traverse children".
981 pos = BinaryFormat::skipProbability(flags, pos);
982 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
983 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
984 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
985 *newChildrenPosition = childrenPos;
986 return true;
987 }
988 } // namespace latinime
989