1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "LatinIME: correction.cpp"
18
19 #include <cmath>
20
21 #include "char_utils.h"
22 #include "correction.h"
23 #include "defines.h"
24 #include "proximity_info_state.h"
25 #include "suggest_utils.h"
26 #include "suggest/policyimpl/utils/edit_distance.h"
27 #include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h"
28
29 namespace latinime {
30
31 class ProximityInfo;
32
33 /////////////////////////////
34 // edit distance funcitons //
35 /////////////////////////////
36
initEditDistance(int * editDistanceTable)37 inline static void initEditDistance(int *editDistanceTable) {
38 for (int i = 0; i <= MAX_WORD_LENGTH; ++i) {
39 editDistanceTable[i] = i;
40 }
41 }
42
dumpEditDistance10ForDebug(int * editDistanceTable,const int editDistanceTableWidth,const int outputLength)43 inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
44 const int editDistanceTableWidth, const int outputLength) {
45 if (DEBUG_DICT) {
46 AKLOGI("EditDistanceTable");
47 for (int i = 0; i <= 10; ++i) {
48 int c[11];
49 for (int j = 0; j <= 10; ++j) {
50 if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
51 c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
52 } else {
53 c[j] = -1;
54 }
55 }
56 AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
57 c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
58 (void)c; // To suppress compiler warning
59 }
60 }
61 }
62
getCurrentEditDistance(int * editDistanceTable,const int editDistanceTableWidth,const int outputLength,const int inputSize)63 inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
64 const int outputLength, const int inputSize) {
65 if (DEBUG_EDIT_DISTANCE) {
66 AKLOGI("getCurrentEditDistance %d, %d", inputSize, outputLength);
67 }
68 return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputSize];
69 }
70
71 ////////////////
72 // Correction //
73 ////////////////
74
resetCorrection()75 void Correction::resetCorrection() {
76 mTotalTraverseCount = 0;
77 }
78
initCorrection(const ProximityInfo * pi,const int inputSize,const int maxDepth)79 void Correction::initCorrection(const ProximityInfo *pi, const int inputSize, const int maxDepth) {
80 mProximityInfo = pi;
81 mInputSize = inputSize;
82 mMaxDepth = maxDepth;
83 mMaxEditDistance = mInputSize < 5 ? 2 : mInputSize / 2;
84 // TODO: This is not supposed to be required. Check what's going wrong with
85 // editDistance[0 ~ MAX_WORD_LENGTH]
86 initEditDistance(mEditDistanceTable);
87 }
88
initCorrectionState(const int rootPos,const int childCount,const bool traverseAll)89 void Correction::initCorrectionState(
90 const int rootPos, const int childCount, const bool traverseAll) {
91 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
92 // TODO: remove
93 mCorrectionStates[0].mTransposedPos = mTransposedPos;
94 mCorrectionStates[0].mExcessivePos = mExcessivePos;
95 mCorrectionStates[0].mSkipPos = mSkipPos;
96 }
97
setCorrectionParams(const int skipPos,const int excessivePos,const int transposedPos,const int spaceProximityPos,const int missingSpacePos,const bool useFullEditDistance,const bool doAutoCompletion,const int maxErrors)98 void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
99 const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
100 const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
101 // TODO: remove
102 mTransposedPos = transposedPos;
103 mExcessivePos = excessivePos;
104 mSkipPos = skipPos;
105 // TODO: remove
106 mCorrectionStates[0].mTransposedPos = transposedPos;
107 mCorrectionStates[0].mExcessivePos = excessivePos;
108 mCorrectionStates[0].mSkipPos = skipPos;
109
110 mSpaceProximityPos = spaceProximityPos;
111 mMissingSpacePos = missingSpacePos;
112 mUseFullEditDistance = useFullEditDistance;
113 mDoAutoCompletion = doAutoCompletion;
114 mMaxErrors = maxErrors;
115 }
116
checkState() const117 void Correction::checkState() const {
118 if (DEBUG_DICT) {
119 int inputCount = 0;
120 if (mSkipPos >= 0) ++inputCount;
121 if (mExcessivePos >= 0) ++inputCount;
122 if (mTransposedPos >= 0) ++inputCount;
123 }
124 }
125
sameAsTyped() const126 bool Correction::sameAsTyped() const {
127 return mProximityInfoState.sameAsTyped(mWord, mOutputIndex);
128 }
129
getFreqForSplitMultipleWords(const int * freqArray,const int * wordLengthArray,const int wordCount,const bool isSpaceProximity,const int * word) const130 int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
131 const int wordCount, const bool isSpaceProximity, const int *word) const {
132 return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
133 wordCount, this, isSpaceProximity, word);
134 }
135
getFinalProbability(const int probability,int ** word,int * wordLength)136 int Correction::getFinalProbability(const int probability, int **word, int *wordLength) {
137 return getFinalProbabilityInternal(probability, word, wordLength, mInputSize);
138 }
139
getFinalProbabilityForSubQueue(const int probability,int ** word,int * wordLength,const int inputSize)140 int Correction::getFinalProbabilityForSubQueue(const int probability, int **word, int *wordLength,
141 const int inputSize) {
142 return getFinalProbabilityInternal(probability, word, wordLength, inputSize);
143 }
144
initProcessState(const int outputIndex)145 bool Correction::initProcessState(const int outputIndex) {
146 if (mCorrectionStates[outputIndex].mChildCount <= 0) {
147 return false;
148 }
149 mOutputIndex = outputIndex;
150 --(mCorrectionStates[outputIndex].mChildCount);
151 mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
152 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
153
154 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
155 mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
156 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
157 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
158 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
159 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
160
161 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
162 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
163 mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
164
165 mMatching = false;
166 mProximityMatching = false;
167 mAdditionalProximityMatching = false;
168 mTransposing = false;
169 mExceeding = false;
170 mSkipping = false;
171
172 return true;
173 }
174
goDownTree(const int parentIndex,const int childCount,const int firstChildPos)175 int Correction::goDownTree(const int parentIndex, const int childCount, const int firstChildPos) {
176 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
177 mCorrectionStates[mOutputIndex].mChildCount = childCount;
178 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
179 return mOutputIndex;
180 }
181
182 // TODO: remove
getInputIndex() const183 int Correction::getInputIndex() const {
184 return mInputIndex;
185 }
186
needsToPrune() const187 bool Correction::needsToPrune() const {
188 // TODO: use edit distance here
189 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
190 // Allow one char longer word for missing character
191 || (!mDoAutoCompletion && (mOutputIndex > mInputSize));
192 }
193
isEquivalentChar(ProximityType type)194 inline static bool isEquivalentChar(ProximityType type) {
195 return type == MATCH_CHAR;
196 }
197
isProximityCharOrEquivalentChar(ProximityType type)198 inline static bool isProximityCharOrEquivalentChar(ProximityType type) {
199 return type == MATCH_CHAR || type == PROXIMITY_CHAR;
200 }
201
processCharAndCalcState(const int c,const bool isTerminal)202 Correction::CorrectionType Correction::processCharAndCalcState(const int c, const bool isTerminal) {
203 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
204 if (correctionCount > mMaxErrors) {
205 return processUnrelatedCorrectionType();
206 }
207
208 // TODO: Change the limit if we'll allow two or more corrections
209 const bool noCorrectionsHappenedSoFar = correctionCount == 0;
210 const bool canTryCorrection = noCorrectionsHappenedSoFar;
211 int proximityIndex = 0;
212 mDistances[mOutputIndex] = NOT_A_DISTANCE;
213
214 // Skip checking this node
215 if (mNeedsToTraverseAllNodes || isSingleQuote(c)) {
216 bool incremented = false;
217 if (mLastCharExceeded && mInputIndex == mInputSize - 1) {
218 // TODO: Do not check the proximity if EditDistance exceeds the threshold
219 const ProximityType matchId = mProximityInfoState.getProximityType(
220 mInputIndex, c, true, &proximityIndex);
221 if (isEquivalentChar(matchId)) {
222 mLastCharExceeded = false;
223 --mExcessiveCount;
224 mDistances[mOutputIndex] =
225 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
226 } else if (matchId == PROXIMITY_CHAR) {
227 mLastCharExceeded = false;
228 --mExcessiveCount;
229 ++mProximityCount;
230 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(
231 mInputIndex, proximityIndex);
232 }
233 if (!isSingleQuote(c)) {
234 incrementInputIndex();
235 incremented = true;
236 }
237 }
238 return processSkipChar(c, isTerminal, incremented);
239 }
240
241 // Check possible corrections.
242 if (mExcessivePos >= 0) {
243 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
244 mExcessivePos = mOutputIndex;
245 }
246 if (mExcessivePos < mInputSize - 1) {
247 mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
248 }
249 }
250
251 if (mSkipPos >= 0) {
252 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
253 if (DEBUG_DICT) {
254 // TODO: Enable this assertion.
255 //ASSERT(mSkipPos == mOutputIndex - 1);
256 }
257 mSkipPos = mOutputIndex;
258 }
259 mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
260 }
261
262 if (mTransposedPos >= 0) {
263 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
264 mTransposedPos = mOutputIndex;
265 }
266 if (mTransposedPos < mInputSize - 1) {
267 mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
268 }
269 }
270
271 bool secondTransposing = false;
272 if (mTransposedCount % 2 == 1) {
273 if (isEquivalentChar(mProximityInfoState.getProximityType(
274 mInputIndex - 1, c, false))) {
275 ++mTransposedCount;
276 secondTransposing = true;
277 } else if (mCorrectionStates[mOutputIndex].mExceeding) {
278 --mTransposedCount;
279 ++mExcessiveCount;
280 --mExcessivePos;
281 incrementInputIndex();
282 } else {
283 --mTransposedCount;
284 if (DEBUG_CORRECTION
285 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
286 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
287 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
288 DUMP_WORD(mWord, mOutputIndex);
289 AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
290 mTransposedCount, mExcessiveCount, c);
291 }
292 return processUnrelatedCorrectionType();
293 }
294 }
295
296 // TODO: Change the limit if we'll allow two or more proximity chars with corrections
297 // Work around: When the mMaxErrors is 1, we only allow just one error
298 // including proximity correction.
299 const bool checkProximityChars = (mMaxErrors > 1)
300 ? (noCorrectionsHappenedSoFar || mProximityCount == 0)
301 : (noCorrectionsHappenedSoFar && mProximityCount == 0);
302
303 ProximityType matchedProximityCharId = secondTransposing
304 ? MATCH_CHAR
305 : mProximityInfoState.getProximityType(
306 mInputIndex, c, checkProximityChars, &proximityIndex);
307
308 if (SUBSTITUTION_CHAR == matchedProximityCharId
309 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
310 if (canTryCorrection && mOutputIndex > 0
311 && mCorrectionStates[mOutputIndex].mProximityMatching
312 && mCorrectionStates[mOutputIndex].mExceeding
313 && isEquivalentChar(mProximityInfoState.getProximityType(
314 mInputIndex, mWord[mOutputIndex - 1], false))) {
315 if (DEBUG_CORRECTION
316 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
317 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
318 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
319 AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
320 }
321 // Conversion p->e
322 // Example:
323 // wearth -> earth
324 // px -> (E)mmmmm
325 ++mExcessiveCount;
326 --mProximityCount;
327 mExcessivePos = mOutputIndex - 1;
328 ++mInputIndex;
329 // Here, we are doing something equivalent to matchedProximityCharId,
330 // but we already know that "excessive char correction" just happened
331 // so that we just need to check "mProximityCount == 0".
332 matchedProximityCharId = mProximityInfoState.getProximityType(
333 mInputIndex, c, mProximityCount == 0, &proximityIndex);
334 }
335 }
336
337 if (SUBSTITUTION_CHAR == matchedProximityCharId
338 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
339 if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
340 mAdditionalProximityMatching = true;
341 }
342 // TODO: Optimize
343 // As the current char turned out to be an unrelated char,
344 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
345 // here refers to the previous state.
346 if (mInputIndex < mInputSize - 1 && mOutputIndex > 0 && mTransposedCount > 0
347 && !mCorrectionStates[mOutputIndex].mTransposing
348 && mCorrectionStates[mOutputIndex - 1].mTransposing
349 && isEquivalentChar(mProximityInfoState.getProximityType(
350 mInputIndex, mWord[mOutputIndex - 1], false))
351 && isEquivalentChar(
352 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
353 // Conversion t->e
354 // Example:
355 // occaisional -> occa sional
356 // mmmmttx -> mmmm(E)mmmmmm
357 mTransposedCount -= 2;
358 ++mExcessiveCount;
359 ++mInputIndex;
360 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
361 && !mCorrectionStates[mOutputIndex].mTransposing
362 && mCorrectionStates[mOutputIndex - 1].mTransposing
363 && isEquivalentChar(
364 mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) {
365 // Conversion t->s
366 // Example:
367 // chcolate -> chocolate
368 // mmttx -> mmsmmmmmm
369 mTransposedCount -= 2;
370 ++mSkippedCount;
371 --mInputIndex;
372 } else if (canTryCorrection && mInputIndex > 0
373 && mCorrectionStates[mOutputIndex].mProximityMatching
374 && mCorrectionStates[mOutputIndex].mSkipping
375 && isEquivalentChar(
376 mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) {
377 // Conversion p->s
378 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
379 // proximity chars of "s", but it should rather be handled as a skipped char.
380 ++mSkippedCount;
381 --mProximityCount;
382 return processSkipChar(c, isTerminal, false);
383 } else if (mInputIndex - 1 < mInputSize
384 && mSkippedCount > 0
385 && mCorrectionStates[mOutputIndex].mSkipping
386 && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching
387 && isProximityCharOrEquivalentChar(
388 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
389 // Conversion s->a
390 incrementInputIndex();
391 --mSkippedCount;
392 mProximityMatching = true;
393 ++mProximityCount;
394 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
395 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputSize
396 && isEquivalentChar(
397 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
398 // 1.2. Excessive or transpose correction
399 if (mTransposing) {
400 ++mTransposedCount;
401 } else {
402 ++mExcessiveCount;
403 incrementInputIndex();
404 }
405 if (DEBUG_CORRECTION
406 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
407 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
408 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
409 DUMP_WORD(mWord, mOutputIndex);
410 if (mTransposing) {
411 AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
412 mTransposedCount, mExcessiveCount, c);
413 } else {
414 AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
415 mTransposedCount, mExcessiveCount, c);
416 }
417 }
418 } else if (mSkipping) {
419 // 3. Skip correction
420 ++mSkippedCount;
421 if (DEBUG_CORRECTION
422 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
423 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
424 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
425 AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
426 mTransposedCount, mExcessiveCount, c);
427 }
428 return processSkipChar(c, isTerminal, false);
429 } else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
430 // As a last resort, use additional proximity characters
431 mProximityMatching = true;
432 ++mProximityCount;
433 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
434 if (DEBUG_CORRECTION
435 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
436 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
437 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
438 AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
439 mTransposedCount, mExcessiveCount, c);
440 }
441 } else {
442 if (DEBUG_CORRECTION
443 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
444 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
445 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
446 DUMP_WORD(mWord, mOutputIndex);
447 AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
448 mTransposedCount, mExcessiveCount, c);
449 }
450 return processUnrelatedCorrectionType();
451 }
452 } else if (secondTransposing) {
453 // If inputIndex is greater than mInputSize, that means there is no
454 // proximity chars. So, we don't need to check proximity.
455 mMatching = true;
456 } else if (isEquivalentChar(matchedProximityCharId)) {
457 mMatching = true;
458 ++mEquivalentCharCount;
459 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
460 } else if (PROXIMITY_CHAR == matchedProximityCharId) {
461 mProximityMatching = true;
462 ++mProximityCount;
463 mDistances[mOutputIndex] =
464 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex);
465 if (DEBUG_CORRECTION
466 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
467 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
468 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
469 AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
470 mTransposedCount, mExcessiveCount, c);
471 }
472 }
473
474 addCharToCurrentWord(c);
475
476 // 4. Last char excessive correction
477 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
478 && mProximityCount == 0 && (mInputIndex == mInputSize - 2);
479 const bool isSameAsUserTypedLength = (mInputSize == mInputIndex + 1) || mLastCharExceeded;
480 if (mLastCharExceeded) {
481 ++mExcessiveCount;
482 }
483
484 // Start traversing all nodes after the index exceeds the user typed length
485 if (isSameAsUserTypedLength) {
486 startToTraverseAllNodes();
487 }
488
489 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
490 mExceeding && mInputIndex == mInputSize - 2;
491
492 // Finally, we are ready to go to the next character, the next "virtual node".
493 // We should advance the input index.
494 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
495 // characters to input; the other branch is not matching them but searching for
496 // completions, this is why it does not have to do it.
497 incrementInputIndex();
498 // Also, the next char is one "virtual node" depth more than this char.
499 incrementOutputIndex();
500
501 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
502 || isSameAsUserTypedLength) && isTerminal) {
503 mTerminalInputIndex = mInputIndex - 1;
504 mTerminalOutputIndex = mOutputIndex - 1;
505 if (DEBUG_CORRECTION
506 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
507 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
508 DUMP_WORD(mWord, mOutputIndex);
509 AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
510 mTransposedCount, mExcessiveCount, c);
511 }
512 return ON_TERMINAL;
513 } else {
514 mTerminalInputIndex = mInputIndex - 1;
515 mTerminalOutputIndex = mOutputIndex - 1;
516 return NOT_ON_TERMINAL;
517 }
518 }
519
getQuoteCount(const int * word,const int length)520 inline static int getQuoteCount(const int *word, const int length) {
521 int quoteCount = 0;
522 for (int i = 0; i < length; ++i) {
523 if (word[i] == KEYCODE_SINGLE_QUOTE) {
524 ++quoteCount;
525 }
526 }
527 return quoteCount;
528 }
529
isUpperCase(unsigned short c)530 inline static bool isUpperCase(unsigned short c) {
531 return isAsciiUpper(toBaseCodePoint(c));
532 }
533
534 //////////////////////
535 // RankingAlgorithm //
536 //////////////////////
537
calculateFinalProbability(const int inputIndex,const int outputIndex,const int freq,int * editDistanceTable,const Correction * correction,const int inputSize)538 /* static */ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex,
539 const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction,
540 const int inputSize) {
541 const int excessivePos = correction->getExcessivePos();
542 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
543 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
544 const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState;
545 const int skippedCount = correction->mSkippedCount;
546 const int transposedCount = correction->mTransposedCount / 2;
547 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
548 const int proximityMatchedCount = correction->mProximityCount;
549 const bool lastCharExceeded = correction->mLastCharExceeded;
550 const bool useFullEditDistance = correction->mUseFullEditDistance;
551 const int outputLength = outputIndex + 1;
552 if (skippedCount >= inputSize || inputSize == 0) {
553 return -1;
554 }
555
556 // TODO: find more robust way
557 bool sameLength = lastCharExceeded ? (inputSize == inputIndex + 2)
558 : (inputSize == inputIndex + 1);
559
560 // TODO: use mExcessiveCount
561 const int matchCount = inputSize - correction->mProximityCount - excessiveCount;
562
563 const int *word = correction->mWord;
564 const bool skipped = skippedCount > 0;
565
566 const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
567 - getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputSize));
568
569 // TODO: Calculate edit distance for transposed and excessive
570 int ed = 0;
571 if (DEBUG_DICT_FULL) {
572 dumpEditDistance10ForDebug(editDistanceTable, correction->mInputSize, outputLength);
573 }
574 int adjustedProximityMatchedCount = proximityMatchedCount;
575
576 int finalFreq = freq;
577
578 if (DEBUG_CORRECTION_FREQ
579 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) {
580 AKLOGI("FinalFreq0: %d", finalFreq);
581 }
582 // TODO: Optimize this.
583 if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
584 ed = getCurrentEditDistance(editDistanceTable, correction->mInputSize, outputLength,
585 inputSize) - transposedCount;
586
587 const int matchWeight = powerIntCapped(typedLetterMultiplier,
588 max(inputSize, outputLength) - ed);
589 multiplyIntCapped(matchWeight, &finalFreq);
590
591 // TODO: Demote further if there are two or more excessive chars with longer user input?
592 if (inputSize > outputLength) {
593 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
594 }
595
596 ed = max(0, ed - quoteDiffCount);
597 adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputSize)),
598 proximityMatchedCount);
599 if (transposedCount <= 0) {
600 if (ed == 1 && (inputSize == outputLength - 1 || inputSize == outputLength + 1)) {
601 // Promote a word with just one skipped or excessive char
602 if (sameLength) {
603 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE
604 + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength,
605 &finalFreq);
606 } else {
607 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
608 }
609 } else if (ed == 0) {
610 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
611 sameLength = true;
612 }
613 }
614 } else {
615 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
616 multiplyIntCapped(matchWeight, &finalFreq);
617 }
618
619 if (proximityInfoState->getProximityType(0, word[0], true) == SUBSTITUTION_CHAR) {
620 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
621 }
622
623 ///////////////////////////////////////////////
624 // Promotion and Demotion for each correction
625
626 // Demotion for a word with missing character
627 if (skipped) {
628 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
629 * (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
630 / (10 * inputSize
631 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
632 if (DEBUG_DICT_FULL) {
633 AKLOGI("Demotion rate for missing character is %d.", demotionRate);
634 }
635 multiplyRate(demotionRate, &finalFreq);
636 }
637
638 // Demotion for a word with transposed character
639 if (transposedCount > 0) multiplyRate(
640 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
641
642 // Demotion for a word with excessive character
643 if (excessiveCount > 0) {
644 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
645 if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) {
646 if (DEBUG_DICT_FULL) {
647 AKLOGI("Double excessive demotion");
648 }
649 // If an excessive character is not adjacent to the left char or the right char,
650 // we will demote this word.
651 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
652 }
653 }
654
655 int additionalProximityCount = 0;
656 // Demote additional proximity characters
657 for (int i = 0; i < outputLength; ++i) {
658 const int squaredDistance = correction->mDistances[i];
659 if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
660 ++additionalProximityCount;
661 }
662 }
663
664 const bool performTouchPositionCorrection =
665 CALIBRATE_SCORE_BY_TOUCH_COORDINATES
666 && proximityInfoState->touchPositionCorrectionEnabled()
667 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0
668 && additionalProximityCount == 0;
669
670 // Score calibration by touch coordinates is being done only for pure-fat finger typing error
671 // cases.
672 // TODO: Remove this constraint.
673 if (performTouchPositionCorrection) {
674 for (int i = 0; i < outputLength; ++i) {
675 const int squaredDistance = correction->mDistances[i];
676 if (i < adjustedProximityMatchedCount) {
677 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
678 }
679 const float factor =
680 SuggestUtils::getLengthScalingFactor(static_cast<float>(squaredDistance));
681 if (factor > 0.0f) {
682 multiplyRate(static_cast<int>(factor * 100.0f), &finalFreq);
683 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
684 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
685 }
686 }
687 } else {
688 // Promotion for a word with proximity characters
689 for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
690 // A word with proximity corrections
691 if (DEBUG_DICT_FULL) {
692 AKLOGI("Found a proximity correction.");
693 }
694 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
695 if (i < additionalProximityCount) {
696 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
697 } else {
698 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
699 }
700 }
701 }
702
703 // If the user types too many(three or more) proximity characters with additional proximity
704 // character,do not treat as the same length word.
705 if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3
706 || transposedCount > 0 || skipped || excessiveCount > 0)) {
707 sameLength = false;
708 }
709
710 const int errorCount = adjustedProximityMatchedCount > 0
711 ? adjustedProximityMatchedCount
712 : (proximityMatchedCount + transposedCount);
713 multiplyRate(
714 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputSize, &finalFreq);
715
716 // Promotion for an exactly matched word
717 if (ed == 0) {
718 // Full exact match
719 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
720 && quoteDiffCount == 0 && additionalProximityCount == 0) {
721 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
722 }
723 }
724
725 // Promote a word with no correction
726 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0
727 && additionalProximityCount == 0) {
728 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
729 }
730
731 // TODO: Check excessive count and transposed count
732 // TODO: Remove this if possible
733 /*
734 If the last character of the user input word is the same as the next character
735 of the output word, and also all of characters of the user input are matched
736 to the output word, we'll promote that word a bit because
737 that word can be considered the combination of skipped and matched characters.
738 This means that the 'sm' pattern wins over the 'ma' pattern.
739 e.g.)
740 shel -> shell [mmmma] or [mmmsm]
741 hel -> hello [mmmaa] or [mmsma]
742 m ... matching
743 s ... skipping
744 a ... traversing all
745 t ... transposing
746 e ... exceeding
747 p ... proximity matching
748 */
749 if (matchCount == inputSize && matchCount >= 2 && !skipped
750 && word[matchCount] == word[matchCount - 1]) {
751 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
752 }
753
754 // TODO: Do not use sameLength?
755 if (sameLength) {
756 multiplyIntCapped(fullWordMultiplier, &finalFreq);
757 }
758
759 if (useFullEditDistance && outputLength > inputSize + 1) {
760 const int diff = outputLength - inputSize - 1;
761 const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
762 finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
763 }
764
765 if (DEBUG_DICT_FULL) {
766 AKLOGI("calc: %d, %d", outputLength, sameLength);
767 }
768
769 if (DEBUG_CORRECTION_FREQ
770 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) {
771 DUMP_WORD(correction->getPrimaryInputWord(), inputSize);
772 DUMP_WORD(correction->mWord, outputLength);
773 AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
774 skippedCount, transposedCount, excessiveCount, additionalProximityCount,
775 outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq);
776 }
777
778 return finalFreq;
779 }
780
calcFreqForSplitMultipleWords(const int * freqArray,const int * wordLengthArray,const int wordCount,const Correction * correction,const bool isSpaceProximity,const int * word)781 /* static */ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(const int *freqArray,
782 const int *wordLengthArray, const int wordCount, const Correction *correction,
783 const bool isSpaceProximity, const int *word) {
784 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
785
786 bool firstCapitalizedWordDemotion = false;
787 bool secondCapitalizedWordDemotion = false;
788
789 {
790 // TODO: Handle multiple capitalized word demotion properly
791 const int firstWordLength = wordLengthArray[0];
792 const int secondWordLength = wordLengthArray[1];
793 if (firstWordLength >= 2) {
794 firstCapitalizedWordDemotion = isUpperCase(word[0]);
795 }
796
797 if (secondWordLength >= 2) {
798 // FIXME: word[firstWordLength + 1] is incorrect.
799 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
800 }
801 }
802
803
804 const bool capitalizedWordDemotion =
805 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
806
807 int totalLength = 0;
808 int totalFreq = 0;
809 for (int i = 0; i < wordCount; ++i) {
810 const int wordLength = wordLengthArray[i];
811 if (wordLength <= 0) {
812 return 0;
813 }
814 totalLength += wordLength;
815 const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
816 int tempFirstFreq = freqArray[i];
817 multiplyRate(demotionRate, &tempFirstFreq);
818 totalFreq += tempFirstFreq;
819 }
820
821 if (totalLength <= 0 || totalFreq <= 0) {
822 return 0;
823 }
824
825 // TODO: Currently totalFreq is adjusted to two word metrix.
826 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
827 // length.
828 totalFreq = totalFreq * 2 / wordCount;
829 if (wordCount > 2) {
830 // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
831 int oneLengthCounter = 0;
832 int twoLengthCounter = 0;
833 for (int i = 0; i < wordCount; ++i) {
834 const int wordLength = wordLengthArray[i];
835 // TODO: Use bigram instead of this safety net
836 if (i < wordCount - 1) {
837 const int nextWordLength = wordLengthArray[i + 1];
838 if (wordLength == 1 && nextWordLength == 2) {
839 // Safety net to filter 1 length and 2 length sequential words
840 return 0;
841 }
842 }
843 const int freq = freqArray[i];
844 // Demote too short weak words
845 if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) {
846 multiplyRate(100 * freq / MAX_PROBABILITY, &totalFreq);
847 }
848 if (wordLength == 1) {
849 ++oneLengthCounter;
850 } else if (wordLength == 2) {
851 ++twoLengthCounter;
852 }
853 if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
854 // Safety net to filter too many short words
855 return 0;
856 }
857 }
858 multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
859 }
860
861 // This is a workaround to try offsetting the not-enough-demotion which will be done in
862 // calcNormalizedScore in Utils.java.
863 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
864 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
865 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
866 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
867 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
868
869 // At this moment, totalFreq is calculated by the following formula:
870 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
871 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
872
873 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
874
875 // This is another workaround to offset the demotion which will be done in
876 // calcNormalizedScore in Utils.java.
877 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
878 // the same amount because we already have adjusted the synthetic freq of this "missing or
879 // mistyped space" suggestion candidate above in this method.
880 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
881 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
882
883 if (isSpaceProximity) {
884 // A word pair with one space proximity correction
885 if (DEBUG_DICT) {
886 AKLOGI("Found a word pair with space proximity correction.");
887 }
888 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
889 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
890 }
891
892 if (isSpaceProximity) {
893 multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
894 } else {
895 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
896 }
897
898 if (capitalizedWordDemotion) {
899 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
900 }
901
902 if (DEBUG_CORRECTION_FREQ) {
903 AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
904 wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
905 DUMP_WORD(word, wordLengthArray[0]);
906 }
907
908 return totalFreq;
909 }
910
editDistance(const int * before,const int beforeLength,const int * after,const int afterLength)911 /* static */ int Correction::RankingAlgorithm::editDistance(const int *before,
912 const int beforeLength, const int *after, const int afterLength) {
913 const DamerauLevenshteinEditDistancePolicy daemaruLevenshtein(
914 before, beforeLength, after, afterLength);
915 return static_cast<int>(EditDistance::getEditDistance(&daemaruLevenshtein));
916 }
917
918
919 // In dictionary.cpp, getSuggestion() method,
920 // When USE_SUGGEST_INTERFACE_FOR_TYPING is true:
921 // SUGGEST_INTERFACE_OUTPUT_SCALE was multiplied to the original suggestion scores to convert
922 // them to integers.
923 // score = (int)((original score) * SUGGEST_INTERFACE_OUTPUT_SCALE)
924 // Undo the scaling here to recover the original score.
925 // normalizedScore = ((float)score) / SUGGEST_INTERFACE_OUTPUT_SCALE
926 // Otherwise: suggestion scores are computed using the below formula.
927 // original score
928 // := powf(mTypedLetterMultiplier (this is defined 2),
929 // (the number of matched characters between typed word and suggested word))
930 // * (individual word's score which defined in the unigram dictionary,
931 // and this score is defined in range [0, 255].)
932 // Then, the following processing is applied.
933 // - If the dictionary word is matched up to the point of the user entry
934 // (full match up to min(before.length(), after.length())
935 // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
936 // - If the word is a true full match except for differences in accents or
937 // capitalization, then treat it as if the score was 255.
938 // - If before.length() == after.length()
939 // => multiply by mFullWordMultiplier (this is defined 2))
940 // So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2
941 // For historical reasons we ignore the 1.2 modifier (because the measure for a good
942 // autocorrection threshold was done at a time when it didn't exist). This doesn't change
943 // the result.
944 // So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2.
945
calcNormalizedScore(const int * before,const int beforeLength,const int * after,const int afterLength,const int score)946 /* static */ float Correction::RankingAlgorithm::calcNormalizedScore(const int *before,
947 const int beforeLength, const int *after, const int afterLength, const int score) {
948 if (0 == beforeLength || 0 == afterLength) {
949 return 0.0f;
950 }
951 const int distance = editDistance(before, beforeLength, after, afterLength);
952 int spaceCount = 0;
953 for (int i = 0; i < afterLength; ++i) {
954 if (after[i] == KEYCODE_SPACE) {
955 ++spaceCount;
956 }
957 }
958
959 if (spaceCount == afterLength) {
960 return 0.0f;
961 }
962
963 // add a weight based on edit distance.
964 // distance <= max(afterLength, beforeLength) == afterLength,
965 // so, 0 <= distance / afterLength <= 1
966 const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength);
967
968 if (USE_SUGGEST_INTERFACE_FOR_TYPING) {
969 return (static_cast<float>(score) / SUGGEST_INTERFACE_OUTPUT_SCALE) * weight;
970 }
971 const float maxScore = score >= S_INT_MAX ? static_cast<float>(S_INT_MAX)
972 : static_cast<float>(MAX_INITIAL_SCORE)
973 * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER),
974 static_cast<float>(min(beforeLength, afterLength - spaceCount)))
975 * static_cast<float>(FULL_WORD_MULTIPLIER);
976
977 return (static_cast<float>(score) / maxScore) * weight;
978 }
979 } // namespace latinime
980