• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <ctype.h>
19 #include <stdio.h>
20 #include <string.h>
21 
22 #define LOG_TAG "LatinIME: correction.cpp"
23 
24 #include "correction.h"
25 #include "dictionary.h"
26 #include "proximity_info.h"
27 
28 namespace latinime {
29 
30 //////////////////////
31 // inline functions //
32 //////////////////////
33 static const char QUOTE = '\'';
34 
isQuote(const unsigned short c)35 inline bool Correction::isQuote(const unsigned short c) {
36     const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex);
37     return (c == QUOTE && userTypedChar != QUOTE);
38 }
39 
40 ////////////////
41 // Correction //
42 ////////////////
43 
Correction(const int typedLetterMultiplier,const int fullWordMultiplier)44 Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier)
45         : TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
46 }
47 
initCorrection(const ProximityInfo * pi,const int inputLength,const int maxDepth)48 void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
49         const int maxDepth) {
50     mProximityInfo = pi;
51     mInputLength = inputLength;
52     mMaxDepth = maxDepth;
53     mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
54 }
55 
initCorrectionState(const int rootPos,const int childCount,const bool traverseAll)56 void Correction::initCorrectionState(
57         const int rootPos, const int childCount, const bool traverseAll) {
58     latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
59     // TODO: remove
60     mCorrectionStates[0].mTransposedPos = mTransposedPos;
61     mCorrectionStates[0].mExcessivePos = mExcessivePos;
62     mCorrectionStates[0].mSkipPos = mSkipPos;
63 }
64 
setCorrectionParams(const int skipPos,const int excessivePos,const int transposedPos,const int spaceProximityPos,const int missingSpacePos,const bool useFullEditDistance)65 void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
66         const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
67         const bool useFullEditDistance) {
68     // TODO: remove
69     mTransposedPos = transposedPos;
70     mExcessivePos = excessivePos;
71     mSkipPos = skipPos;
72     // TODO: remove
73     mCorrectionStates[0].mTransposedPos = transposedPos;
74     mCorrectionStates[0].mExcessivePos = excessivePos;
75     mCorrectionStates[0].mSkipPos = skipPos;
76 
77     mSpaceProximityPos = spaceProximityPos;
78     mMissingSpacePos = missingSpacePos;
79     mUseFullEditDistance = useFullEditDistance;
80 }
81 
checkState()82 void Correction::checkState() {
83     if (DEBUG_DICT) {
84         int inputCount = 0;
85         if (mSkipPos >= 0) ++inputCount;
86         if (mExcessivePos >= 0) ++inputCount;
87         if (mTransposedPos >= 0) ++inputCount;
88         // TODO: remove this assert
89         assert(inputCount <= 1);
90     }
91 }
92 
getFreqForSplitTwoWords(const int firstFreq,const int secondFreq,const unsigned short * word)93 int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
94         const unsigned short *word) {
95     return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
96             firstFreq, secondFreq, this, word);
97 }
98 
getFinalFreq(const int freq,unsigned short ** word,int * wordLength)99 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
100     const int outputIndex = mTerminalOutputIndex;
101     const int inputIndex = mTerminalInputIndex;
102     *wordLength = outputIndex + 1;
103     if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) {
104         return -1;
105     }
106 
107     *word = mWord;
108     return Correction::RankingAlgorithm::calculateFinalFreq(
109             inputIndex, outputIndex, freq, mEditDistanceTable, this);
110 }
111 
initProcessState(const int outputIndex)112 bool Correction::initProcessState(const int outputIndex) {
113     if (mCorrectionStates[outputIndex].mChildCount <= 0) {
114         return false;
115     }
116     mOutputIndex = outputIndex;
117     --(mCorrectionStates[outputIndex].mChildCount);
118     mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
119     mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
120 
121     mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
122     mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
123     mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
124     mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
125     mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
126     mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
127 
128     mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
129     mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
130     mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
131 
132     mMatching = false;
133     mProximityMatching = false;
134     mTransposing = false;
135     mExceeding = false;
136     mSkipping = false;
137 
138     return true;
139 }
140 
goDownTree(const int parentIndex,const int childCount,const int firstChildPos)141 int Correction::goDownTree(
142         const int parentIndex, const int childCount, const int firstChildPos) {
143     mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
144     mCorrectionStates[mOutputIndex].mChildCount = childCount;
145     mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
146     return mOutputIndex;
147 }
148 
149 // TODO: remove
getOutputIndex()150 int Correction::getOutputIndex() {
151     return mOutputIndex;
152 }
153 
154 // TODO: remove
getInputIndex()155 int Correction::getInputIndex() {
156     return mInputIndex;
157 }
158 
159 // TODO: remove
needsToTraverseAllNodes()160 bool Correction::needsToTraverseAllNodes() {
161     return mNeedsToTraverseAllNodes;
162 }
163 
incrementInputIndex()164 void Correction::incrementInputIndex() {
165     ++mInputIndex;
166 }
167 
incrementOutputIndex()168 void Correction::incrementOutputIndex() {
169     ++mOutputIndex;
170     mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex;
171     mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount;
172     mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
173     mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
174     mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
175 
176     mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount;
177     mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
178     mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount;
179     mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount;
180     mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
181 
182     mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
183     mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos;
184     mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos;
185 
186     mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded;
187 
188     mCorrectionStates[mOutputIndex].mMatching = mMatching;
189     mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
190     mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
191     mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
192     mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
193 }
194 
startToTraverseAllNodes()195 void Correction::startToTraverseAllNodes() {
196     mNeedsToTraverseAllNodes = true;
197 }
198 
needsToPrune() const199 bool Correction::needsToPrune() const {
200     return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance;
201 }
202 
203 // TODO: inline?
processSkipChar(const int32_t c,const bool isTerminal,const bool inputIndexIncremented)204 Correction::CorrectionType Correction::processSkipChar(
205         const int32_t c, const bool isTerminal, const bool inputIndexIncremented) {
206     mWord[mOutputIndex] = c;
207     if (needsToTraverseAllNodes() && isTerminal) {
208         mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
209         mTerminalOutputIndex = mOutputIndex;
210         incrementOutputIndex();
211         return TRAVERSE_ALL_ON_TERMINAL;
212     } else {
213         incrementOutputIndex();
214         return TRAVERSE_ALL_NOT_ON_TERMINAL;
215     }
216 }
217 
isEquivalentChar(ProximityInfo::ProximityType type)218 inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
219     return type == ProximityInfo::EQUIVALENT_CHAR;
220 }
221 
processCharAndCalcState(const int32_t c,const bool isTerminal)222 Correction::CorrectionType Correction::processCharAndCalcState(
223         const int32_t c, const bool isTerminal) {
224     const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
225     // TODO: Change the limit if we'll allow two or more corrections
226     const bool noCorrectionsHappenedSoFar = correctionCount == 0;
227     const bool canTryCorrection = noCorrectionsHappenedSoFar;
228     int proximityIndex = 0;
229     mDistances[mOutputIndex] = NOT_A_DISTANCE;
230 
231     if (mNeedsToTraverseAllNodes || isQuote(c)) {
232         bool incremented = false;
233         if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
234             // TODO: Do not check the proximity if EditDistance exceeds the threshold
235             const ProximityInfo::ProximityType matchId =
236                     mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex);
237             if (isEquivalentChar(matchId)) {
238                 mLastCharExceeded = false;
239                 --mExcessiveCount;
240                 mDistances[mOutputIndex] =
241                         mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
242             } else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) {
243                 mLastCharExceeded = false;
244                 --mExcessiveCount;
245                 ++mProximityCount;
246                 mDistances[mOutputIndex] =
247                         mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
248             }
249             incrementInputIndex();
250             incremented = true;
251         }
252         return processSkipChar(c, isTerminal, incremented);
253     }
254 
255     if (mExcessivePos >= 0) {
256         if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
257             mExcessivePos = mOutputIndex;
258         }
259         if (mExcessivePos < mInputLength - 1) {
260             mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
261         }
262     }
263 
264     if (mSkipPos >= 0) {
265         if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
266             if (DEBUG_DICT) {
267                 assert(mSkipPos == mOutputIndex - 1);
268             }
269             mSkipPos = mOutputIndex;
270         }
271         mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
272     }
273 
274     if (mTransposedPos >= 0) {
275         if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
276             mTransposedPos = mOutputIndex;
277         }
278         if (mTransposedPos < mInputLength - 1) {
279             mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
280         }
281     }
282 
283     bool secondTransposing = false;
284     if (mTransposedCount % 2 == 1) {
285         if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
286             ++mTransposedCount;
287             secondTransposing = true;
288         } else if (mCorrectionStates[mOutputIndex].mExceeding) {
289             --mTransposedCount;
290             ++mExcessiveCount;
291             --mExcessivePos;
292             incrementInputIndex();
293         } else {
294             --mTransposedCount;
295             if (DEBUG_CORRECTION) {
296                 DUMP_WORD(mWord, mOutputIndex);
297                 LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
298                         mTransposedCount, mExcessiveCount, c);
299             }
300             return UNRELATED;
301         }
302     }
303 
304     // TODO: Change the limit if we'll allow two or more proximity chars with corrections
305     const bool checkProximityChars = noCorrectionsHappenedSoFar ||  mProximityCount == 0;
306     ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
307             ? ProximityInfo::EQUIVALENT_CHAR
308             : mProximityInfo->getMatchedProximityId(
309                     mInputIndex, c, checkProximityChars, &proximityIndex);
310 
311     if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
312         if (canTryCorrection && mOutputIndex > 0
313                 && mCorrectionStates[mOutputIndex].mProximityMatching
314                 && mCorrectionStates[mOutputIndex].mExceeding
315                 && isEquivalentChar(mProximityInfo->getMatchedProximityId(
316                         mInputIndex, mWord[mOutputIndex - 1], false))) {
317             if (DEBUG_CORRECTION) {
318                 LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
319             }
320             // Conversion p->e
321             // Example:
322             // wearth ->    earth
323             // px     -> (E)mmmmm
324             ++mExcessiveCount;
325             --mProximityCount;
326             mExcessivePos = mOutputIndex - 1;
327             ++mInputIndex;
328             // Here, we are doing something equivalent to matchedProximityCharId,
329             // but we already know that "excessive char correction" just happened
330             // so that we just need to check "mProximityCount == 0".
331             matchedProximityCharId = mProximityInfo->getMatchedProximityId(
332                     mInputIndex, c, mProximityCount == 0, &proximityIndex);
333         }
334     }
335 
336     if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
337         // TODO: Optimize
338         // As the current char turned out to be an unrelated char,
339         // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
340         // here refers to the previous state.
341         if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0
342                 && !mCorrectionStates[mOutputIndex].mTransposing
343                 && mCorrectionStates[mOutputIndex - 1].mTransposing
344                 && isEquivalentChar(mProximityInfo->getMatchedProximityId(
345                         mInputIndex, mWord[mOutputIndex - 1], false))
346                 && isEquivalentChar(
347                         mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
348             // Conversion t->e
349             // Example:
350             // occaisional -> occa   sional
351             // mmmmttx     -> mmmm(E)mmmmmm
352             mTransposedCount -= 2;
353             ++mExcessiveCount;
354             ++mInputIndex;
355         } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
356                 && !mCorrectionStates[mOutputIndex].mTransposing
357                 && mCorrectionStates[mOutputIndex - 1].mTransposing
358                 && isEquivalentChar(
359                         mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
360             // Conversion t->s
361             // Example:
362             // chcolate -> chocolate
363             // mmttx    -> mmsmmmmmm
364             mTransposedCount -= 2;
365             ++mSkippedCount;
366             --mInputIndex;
367         } else if (canTryCorrection && mInputIndex > 0
368                 && mCorrectionStates[mOutputIndex].mProximityMatching
369                 && mCorrectionStates[mOutputIndex].mSkipping
370                 && isEquivalentChar(
371                         mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
372             // Conversion p->s
373             // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
374             // proximity chars of "s", but it should rather be handled as a skipped char.
375             ++mSkippedCount;
376             --mProximityCount;
377             return processSkipChar(c, isTerminal, false);
378         } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength
379                 && isEquivalentChar(
380                         mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
381             // 1.2. Excessive or transpose correction
382             if (mTransposing) {
383                 ++mTransposedCount;
384             } else {
385                 ++mExcessiveCount;
386                 incrementInputIndex();
387             }
388         } else if (mSkipping) {
389             // 3. Skip correction
390             ++mSkippedCount;
391             return processSkipChar(c, isTerminal, false);
392         } else {
393             if (DEBUG_CORRECTION) {
394                 DUMP_WORD(mWord, mOutputIndex);
395                 LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
396                         mTransposedCount, mExcessiveCount, c);
397             }
398             return UNRELATED;
399         }
400     } else if (secondTransposing) {
401         // If inputIndex is greater than mInputLength, that means there is no
402         // proximity chars. So, we don't need to check proximity.
403         mMatching = true;
404     } else if (isEquivalentChar(matchedProximityCharId)) {
405         mMatching = true;
406         ++mEquivalentCharCount;
407         mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
408     } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
409         mProximityMatching = true;
410         ++mProximityCount;
411         mDistances[mOutputIndex] =
412                 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
413     }
414 
415     mWord[mOutputIndex] = c;
416 
417     // 4. Last char excessive correction
418     mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
419             && mProximityCount == 0 && (mInputIndex == mInputLength - 2);
420     const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded;
421     if (mLastCharExceeded) {
422         ++mExcessiveCount;
423     }
424 
425     // Start traversing all nodes after the index exceeds the user typed length
426     if (isSameAsUserTypedLength) {
427         startToTraverseAllNodes();
428     }
429 
430     const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
431             mExceeding && mInputIndex == mInputLength - 2;
432 
433     // Finally, we are ready to go to the next character, the next "virtual node".
434     // We should advance the input index.
435     // We do this in this branch of the 'if traverseAllNodes' because we are still matching
436     // characters to input; the other branch is not matching them but searching for
437     // completions, this is why it does not have to do it.
438     incrementInputIndex();
439     // Also, the next char is one "virtual node" depth more than this char.
440     incrementOutputIndex();
441 
442     if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
443             || isSameAsUserTypedLength) && isTerminal) {
444         mTerminalInputIndex = mInputIndex - 1;
445         mTerminalOutputIndex = mOutputIndex - 1;
446         if (DEBUG_CORRECTION) {
447             DUMP_WORD(mWord, mOutputIndex);
448             LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
449                     mTransposedCount, mExcessiveCount, c);
450         }
451         return ON_TERMINAL;
452     } else {
453         return NOT_ON_TERMINAL;
454     }
455 }
456 
~Correction()457 Correction::~Correction() {
458 }
459 
460 /////////////////////////
461 // static inline utils //
462 /////////////////////////
463 
464 static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num)465 static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
466     return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
467 }
468 
469 static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
multiplyIntCapped(const int multiplier,int * base)470 inline static void multiplyIntCapped(const int multiplier, int *base) {
471     const int temp = *base;
472     if (temp != S_INT_MAX) {
473         // Branch if multiplier == 2 for the optimization
474         if (multiplier == 2) {
475             *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
476         } else {
477             // TODO: This overflow check gives a wrong answer when, for example,
478             //       temp = 2^16 + 1 and multiplier = 2^17 + 1.
479             //       Fix this behavior.
480             const int tempRetval = temp * multiplier;
481             *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
482         }
483     }
484 }
485 
powerIntCapped(const int base,const int n)486 inline static int powerIntCapped(const int base, const int n) {
487     if (n <= 0) return 1;
488     if (base == 2) {
489         return n < 31 ? 1 << n : S_INT_MAX;
490     } else {
491         int ret = base;
492         for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
493         return ret;
494     }
495 }
496 
multiplyRate(const int rate,int * freq)497 inline static void multiplyRate(const int rate, int *freq) {
498     if (*freq != S_INT_MAX) {
499         if (*freq > 1000000) {
500             *freq /= 100;
501             multiplyIntCapped(rate, freq);
502         } else {
503             multiplyIntCapped(rate, freq);
504             *freq /= 100;
505         }
506     }
507 }
508 
getQuoteCount(const unsigned short * word,const int length)509 inline static int getQuoteCount(const unsigned short* word, const int length) {
510     int quoteCount = 0;
511     for (int i = 0; i < length; ++i) {
512         if(word[i] == '\'') {
513             ++quoteCount;
514         }
515     }
516     return quoteCount;
517 }
518 
isUpperCase(unsigned short c)519 inline static bool isUpperCase(unsigned short c) {
520      if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
521          c = BASE_CHARS[c];
522      }
523      if (isupper(c)) {
524          return true;
525      }
526      return false;
527 }
528 
529 /* static */
editDistance(int * editDistanceTable,const unsigned short * input,const int inputLength,const unsigned short * output,const int outputLength)530 inline static int editDistance(
531         int* editDistanceTable, const unsigned short* input,
532         const int inputLength, const unsigned short* output, const int outputLength) {
533     // dp[li][lo] dp[a][b] = dp[ a * lo + b]
534     int* dp = editDistanceTable;
535     const int li = inputLength + 1;
536     const int lo = outputLength + 1;
537     for (int i = 0; i < li; ++i) {
538         dp[lo * i] = i;
539     }
540     for (int i = 0; i < lo; ++i) {
541         dp[i] = i;
542     }
543 
544     for (int i = 0; i < li - 1; ++i) {
545         for (int j = 0; j < lo - 1; ++j) {
546             const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
547             const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
548             const uint16_t cost = (ci == co) ? 0 : 1;
549             dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
550                     min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
551             if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
552                     && co == Dictionary::toBaseLowerCase(input[i - 1])) {
553                 dp[(i + 1) * lo + (j + 1)] = min(
554                         dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
555             }
556         }
557     }
558 
559     if (DEBUG_EDIT_DISTANCE) {
560         LOGI("IN = %d, OUT = %d", inputLength, outputLength);
561         for (int i = 0; i < li; ++i) {
562             for (int j = 0; j < lo; ++j) {
563                 LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
564             }
565         }
566     }
567     return dp[li * lo - 1];
568 }
569 
570 //////////////////////
571 // RankingAlgorithm //
572 //////////////////////
573 
574 /* static */
calculateFinalFreq(const int inputIndex,const int outputIndex,const int freq,int * editDistanceTable,const Correction * correction)575 int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
576         const int freq, int* editDistanceTable, const Correction* correction) {
577     const int excessivePos = correction->getExcessivePos();
578     const int inputLength = correction->mInputLength;
579     const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
580     const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
581     const ProximityInfo *proximityInfo = correction->mProximityInfo;
582     const int skippedCount = correction->mSkippedCount;
583     const int transposedCount = correction->mTransposedCount / 2;
584     const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
585     const int proximityMatchedCount = correction->mProximityCount;
586     const bool lastCharExceeded = correction->mLastCharExceeded;
587     const bool useFullEditDistance = correction->mUseFullEditDistance;
588     const int outputLength = outputIndex + 1;
589     if (skippedCount >= inputLength || inputLength == 0) {
590         return -1;
591     }
592 
593     // TODO: find more robust way
594     bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2)
595             : (inputLength == inputIndex + 1);
596 
597     // TODO: use mExcessiveCount
598     const int matchCount = inputLength - correction->mProximityCount - excessiveCount;
599 
600     const unsigned short* word = correction->mWord;
601     const bool skipped = skippedCount > 0;
602 
603     const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
604             - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
605 
606     // TODO: Calculate edit distance for transposed and excessive
607     int ed = 0;
608     int adjustedProximityMatchedCount = proximityMatchedCount;
609 
610     int finalFreq = freq;
611 
612     // TODO: Optimize this.
613     // TODO: Ignoring edit distance for transposed char, for now
614     if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) {
615         const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
616         ed = editDistance(editDistanceTable, primaryInputWord,
617                 inputLength, word, outputIndex + 1);
618         const int matchWeight = powerIntCapped(typedLetterMultiplier,
619                 max(inputLength, outputIndex + 1) - ed);
620         multiplyIntCapped(matchWeight, &finalFreq);
621 
622         // TODO: Demote further if there are two or more excessive chars with longer user input?
623         if (inputLength > outputIndex + 1) {
624             multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
625         }
626 
627         ed = max(0, ed - quoteDiffCount);
628 
629         if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) {
630             // Promote a word with just one skipped or excessive char
631             if (sameLength) {
632                 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq);
633             } else {
634                 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
635             }
636         } else if (ed == 0) {
637             multiplyIntCapped(typedLetterMultiplier, &finalFreq);
638             sameLength = true;
639         }
640         adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
641                 proximityMatchedCount);
642     } else {
643         // TODO: Calculate the edit distance for transposed char
644         const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
645         multiplyIntCapped(matchWeight, &finalFreq);
646     }
647 
648     if (proximityInfo->getMatchedProximityId(0, word[0], true)
649             == ProximityInfo::UNRELATED_CHAR) {
650         multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
651     }
652 
653     ///////////////////////////////////////////////
654     // Promotion and Demotion for each correction
655 
656     // Demotion for a word with missing character
657     if (skipped) {
658         const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
659                 * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
660                 / (10 * inputLength
661                         - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
662         if (DEBUG_DICT_FULL) {
663             LOGI("Demotion rate for missing character is %d.", demotionRate);
664         }
665         multiplyRate(demotionRate, &finalFreq);
666     }
667 
668     // Demotion for a word with transposed character
669     if (transposedCount > 0) multiplyRate(
670             WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
671 
672     // Demotion for a word with excessive character
673     if (excessiveCount > 0) {
674         multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
675         if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) {
676             if (DEBUG_CORRECTION_FREQ) {
677                 LOGI("Double excessive demotion");
678             }
679             // If an excessive character is not adjacent to the left char or the right char,
680             // we will demote this word.
681             multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
682         }
683     }
684 
685     // Score calibration by touch coordinates is being done only for pure-fat finger typing error
686     // cases.
687     // TODO: Remove this constraint.
688     if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled()
689             && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) {
690         for (int i = 0; i < outputLength; ++i) {
691             const int squaredDistance = correction->mDistances[i];
692             if (i < adjustedProximityMatchedCount) {
693                 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
694             }
695             if (squaredDistance >= 0) {
696                 // Promote or demote the score according to the distance from the sweet spot
697                 static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f;
698                 static const float B = 1.0f;
699                 static const float C = 0.5f;
700                 static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS;
701                 static const float R2 = HALF_SCORE_SQUARED_RADIUS;
702                 const float x = (float)squaredDistance
703                         / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
704                 const float factor = (x < R1)
705                     ? (A * (R1 - x) + B * x) / R1
706                     : (B * (R2 - x) + C * (x - R1)) / (R2 - R1);
707                 // factor is piecewise linear function like:
708                 // A -_                  .
709                 //     ^-_               .
710                 // B      \              .
711                 //         \             .
712                 // C        \            .
713                 //   0   R1 R2
714                 multiplyRate((int)(factor * 100), &finalFreq);
715             } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
716                 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
717             }
718         }
719     } else {
720         // Promotion for a word with proximity characters
721         for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
722             // A word with proximity corrections
723             if (DEBUG_DICT_FULL) {
724                 LOGI("Found a proximity correction.");
725             }
726             multiplyIntCapped(typedLetterMultiplier, &finalFreq);
727             multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
728         }
729     }
730 
731     const int errorCount = adjustedProximityMatchedCount > 0
732             ? adjustedProximityMatchedCount
733             : (proximityMatchedCount + transposedCount);
734     multiplyRate(
735             100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
736 
737     // Promotion for an exactly matched word
738     if (ed == 0) {
739         // Full exact match
740         if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) {
741             finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
742         }
743     }
744 
745     // Promote a word with no correction
746     if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) {
747         multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
748     }
749 
750     // TODO: Check excessive count and transposed count
751     // TODO: Remove this if possible
752     /*
753          If the last character of the user input word is the same as the next character
754          of the output word, and also all of characters of the user input are matched
755          to the output word, we'll promote that word a bit because
756          that word can be considered the combination of skipped and matched characters.
757          This means that the 'sm' pattern wins over the 'ma' pattern.
758          e.g.)
759          shel -> shell [mmmma] or [mmmsm]
760          hel -> hello [mmmaa] or [mmsma]
761          m ... matching
762          s ... skipping
763          a ... traversing all
764          t ... transposing
765          e ... exceeding
766          p ... proximity matching
767      */
768     if (matchCount == inputLength && matchCount >= 2 && !skipped
769             && word[matchCount] == word[matchCount - 1]) {
770         multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
771     }
772 
773     // TODO: Do not use sameLength?
774     if (sameLength) {
775         multiplyIntCapped(fullWordMultiplier, &finalFreq);
776     }
777 
778     if (useFullEditDistance && outputLength > inputLength + 1) {
779         const int diff = outputLength - inputLength - 1;
780         const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
781         finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
782     }
783 
784     if (DEBUG_DICT_FULL) {
785         LOGI("calc: %d, %d", outputIndex, sameLength);
786     }
787 
788     if (DEBUG_CORRECTION_FREQ) {
789         DUMP_WORD(correction->mWord, outputIndex + 1);
790         LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount,
791                 skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength,
792                 quoteDiffCount, ed, finalFreq);
793     }
794 
795     return finalFreq;
796 }
797 
798 /* static */
calcFreqForSplitTwoWords(const int firstFreq,const int secondFreq,const Correction * correction,const unsigned short * word)799 int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
800         const int firstFreq, const int secondFreq, const Correction* correction,
801         const unsigned short *word) {
802     const int spaceProximityPos = correction->mSpaceProximityPos;
803     const int missingSpacePos = correction->mMissingSpacePos;
804     if (DEBUG_DICT) {
805         int inputCount = 0;
806         if (spaceProximityPos >= 0) ++inputCount;
807         if (missingSpacePos >= 0) ++inputCount;
808         assert(inputCount <= 1);
809     }
810     const bool isSpaceProximity = spaceProximityPos >= 0;
811     const int inputLength = correction->mInputLength;
812     const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
813     const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
814             : (inputLength - missingSpacePos);
815     const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
816 
817     bool firstCapitalizedWordDemotion = false;
818     if (firstWordLength >= 2) {
819         firstCapitalizedWordDemotion = isUpperCase(word[0]);
820     }
821 
822     bool secondCapitalizedWordDemotion = false;
823     if (secondWordLength >= 2) {
824         secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
825     }
826 
827     const bool capitalizedWordDemotion =
828             firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
829 
830     if (DEBUG_DICT_FULL) {
831         LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
832     }
833 
834     if (firstWordLength == 0 || secondWordLength == 0) {
835         return 0;
836     }
837     const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
838     int tempFirstFreq = firstFreq;
839     multiplyRate(firstDemotionRate, &tempFirstFreq);
840 
841     const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
842     int tempSecondFreq = secondFreq;
843     multiplyRate(secondDemotionRate, &tempSecondFreq);
844 
845     const int totalLength = firstWordLength + secondWordLength;
846 
847     // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
848     // length.
849     int totalFreq = tempFirstFreq + tempSecondFreq;
850 
851     // This is a workaround to try offsetting the not-enough-demotion which will be done in
852     // calcNormalizedScore in Utils.java.
853     // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
854     // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
855     // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
856     const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
857     multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
858 
859     // At this moment, totalFreq is calculated by the following formula:
860     // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
861     //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
862 
863     multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
864 
865     // This is another workaround to offset the demotion which will be done in
866     // calcNormalizedScore in Utils.java.
867     // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
868     // the same amount because we already have adjusted the synthetic freq of this "missing or
869     // mistyped space" suggestion candidate above in this method.
870     const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
871     multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
872 
873     if (isSpaceProximity) {
874         // A word pair with one space proximity correction
875         if (DEBUG_DICT) {
876             LOGI("Found a word pair with space proximity correction.");
877         }
878         multiplyIntCapped(typedLetterMultiplier, &totalFreq);
879         multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
880     }
881 
882     multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
883 
884     if (capitalizedWordDemotion) {
885         multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
886     }
887 
888     return totalFreq;
889 }
890 
891 } // namespace latinime
892