1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <ctype.h>
19 #include <stdio.h>
20 #include <string.h>
21
22 #define LOG_TAG "LatinIME: correction.cpp"
23
24 #include "correction.h"
25 #include "dictionary.h"
26 #include "proximity_info.h"
27
28 namespace latinime {
29
30 //////////////////////
31 // inline functions //
32 //////////////////////
33 static const char QUOTE = '\'';
34
isQuote(const unsigned short c)35 inline bool Correction::isQuote(const unsigned short c) {
36 const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex);
37 return (c == QUOTE && userTypedChar != QUOTE);
38 }
39
40 ////////////////
41 // Correction //
42 ////////////////
43
Correction(const int typedLetterMultiplier,const int fullWordMultiplier)44 Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier)
45 : TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
46 }
47
initCorrection(const ProximityInfo * pi,const int inputLength,const int maxDepth)48 void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
49 const int maxDepth) {
50 mProximityInfo = pi;
51 mInputLength = inputLength;
52 mMaxDepth = maxDepth;
53 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
54 }
55
initCorrectionState(const int rootPos,const int childCount,const bool traverseAll)56 void Correction::initCorrectionState(
57 const int rootPos, const int childCount, const bool traverseAll) {
58 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
59 // TODO: remove
60 mCorrectionStates[0].mTransposedPos = mTransposedPos;
61 mCorrectionStates[0].mExcessivePos = mExcessivePos;
62 mCorrectionStates[0].mSkipPos = mSkipPos;
63 }
64
setCorrectionParams(const int skipPos,const int excessivePos,const int transposedPos,const int spaceProximityPos,const int missingSpacePos,const bool useFullEditDistance)65 void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
66 const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
67 const bool useFullEditDistance) {
68 // TODO: remove
69 mTransposedPos = transposedPos;
70 mExcessivePos = excessivePos;
71 mSkipPos = skipPos;
72 // TODO: remove
73 mCorrectionStates[0].mTransposedPos = transposedPos;
74 mCorrectionStates[0].mExcessivePos = excessivePos;
75 mCorrectionStates[0].mSkipPos = skipPos;
76
77 mSpaceProximityPos = spaceProximityPos;
78 mMissingSpacePos = missingSpacePos;
79 mUseFullEditDistance = useFullEditDistance;
80 }
81
checkState()82 void Correction::checkState() {
83 if (DEBUG_DICT) {
84 int inputCount = 0;
85 if (mSkipPos >= 0) ++inputCount;
86 if (mExcessivePos >= 0) ++inputCount;
87 if (mTransposedPos >= 0) ++inputCount;
88 // TODO: remove this assert
89 assert(inputCount <= 1);
90 }
91 }
92
getFreqForSplitTwoWords(const int firstFreq,const int secondFreq,const unsigned short * word)93 int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
94 const unsigned short *word) {
95 return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
96 firstFreq, secondFreq, this, word);
97 }
98
getFinalFreq(const int freq,unsigned short ** word,int * wordLength)99 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
100 const int outputIndex = mTerminalOutputIndex;
101 const int inputIndex = mTerminalInputIndex;
102 *wordLength = outputIndex + 1;
103 if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) {
104 return -1;
105 }
106
107 *word = mWord;
108 return Correction::RankingAlgorithm::calculateFinalFreq(
109 inputIndex, outputIndex, freq, mEditDistanceTable, this);
110 }
111
initProcessState(const int outputIndex)112 bool Correction::initProcessState(const int outputIndex) {
113 if (mCorrectionStates[outputIndex].mChildCount <= 0) {
114 return false;
115 }
116 mOutputIndex = outputIndex;
117 --(mCorrectionStates[outputIndex].mChildCount);
118 mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
119 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
120
121 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
122 mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
123 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
124 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
125 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
126 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
127
128 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
129 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
130 mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
131
132 mMatching = false;
133 mProximityMatching = false;
134 mTransposing = false;
135 mExceeding = false;
136 mSkipping = false;
137
138 return true;
139 }
140
goDownTree(const int parentIndex,const int childCount,const int firstChildPos)141 int Correction::goDownTree(
142 const int parentIndex, const int childCount, const int firstChildPos) {
143 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
144 mCorrectionStates[mOutputIndex].mChildCount = childCount;
145 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
146 return mOutputIndex;
147 }
148
149 // TODO: remove
getOutputIndex()150 int Correction::getOutputIndex() {
151 return mOutputIndex;
152 }
153
154 // TODO: remove
getInputIndex()155 int Correction::getInputIndex() {
156 return mInputIndex;
157 }
158
159 // TODO: remove
needsToTraverseAllNodes()160 bool Correction::needsToTraverseAllNodes() {
161 return mNeedsToTraverseAllNodes;
162 }
163
incrementInputIndex()164 void Correction::incrementInputIndex() {
165 ++mInputIndex;
166 }
167
incrementOutputIndex()168 void Correction::incrementOutputIndex() {
169 ++mOutputIndex;
170 mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex;
171 mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount;
172 mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
173 mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
174 mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
175
176 mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount;
177 mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
178 mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount;
179 mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount;
180 mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
181
182 mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
183 mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos;
184 mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos;
185
186 mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded;
187
188 mCorrectionStates[mOutputIndex].mMatching = mMatching;
189 mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
190 mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
191 mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
192 mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
193 }
194
startToTraverseAllNodes()195 void Correction::startToTraverseAllNodes() {
196 mNeedsToTraverseAllNodes = true;
197 }
198
needsToPrune() const199 bool Correction::needsToPrune() const {
200 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance;
201 }
202
203 // TODO: inline?
processSkipChar(const int32_t c,const bool isTerminal,const bool inputIndexIncremented)204 Correction::CorrectionType Correction::processSkipChar(
205 const int32_t c, const bool isTerminal, const bool inputIndexIncremented) {
206 mWord[mOutputIndex] = c;
207 if (needsToTraverseAllNodes() && isTerminal) {
208 mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
209 mTerminalOutputIndex = mOutputIndex;
210 incrementOutputIndex();
211 return TRAVERSE_ALL_ON_TERMINAL;
212 } else {
213 incrementOutputIndex();
214 return TRAVERSE_ALL_NOT_ON_TERMINAL;
215 }
216 }
217
isEquivalentChar(ProximityInfo::ProximityType type)218 inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
219 return type == ProximityInfo::EQUIVALENT_CHAR;
220 }
221
processCharAndCalcState(const int32_t c,const bool isTerminal)222 Correction::CorrectionType Correction::processCharAndCalcState(
223 const int32_t c, const bool isTerminal) {
224 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
225 // TODO: Change the limit if we'll allow two or more corrections
226 const bool noCorrectionsHappenedSoFar = correctionCount == 0;
227 const bool canTryCorrection = noCorrectionsHappenedSoFar;
228 int proximityIndex = 0;
229 mDistances[mOutputIndex] = NOT_A_DISTANCE;
230
231 if (mNeedsToTraverseAllNodes || isQuote(c)) {
232 bool incremented = false;
233 if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
234 // TODO: Do not check the proximity if EditDistance exceeds the threshold
235 const ProximityInfo::ProximityType matchId =
236 mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex);
237 if (isEquivalentChar(matchId)) {
238 mLastCharExceeded = false;
239 --mExcessiveCount;
240 mDistances[mOutputIndex] =
241 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
242 } else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) {
243 mLastCharExceeded = false;
244 --mExcessiveCount;
245 ++mProximityCount;
246 mDistances[mOutputIndex] =
247 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
248 }
249 incrementInputIndex();
250 incremented = true;
251 }
252 return processSkipChar(c, isTerminal, incremented);
253 }
254
255 if (mExcessivePos >= 0) {
256 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
257 mExcessivePos = mOutputIndex;
258 }
259 if (mExcessivePos < mInputLength - 1) {
260 mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
261 }
262 }
263
264 if (mSkipPos >= 0) {
265 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
266 if (DEBUG_DICT) {
267 assert(mSkipPos == mOutputIndex - 1);
268 }
269 mSkipPos = mOutputIndex;
270 }
271 mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
272 }
273
274 if (mTransposedPos >= 0) {
275 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
276 mTransposedPos = mOutputIndex;
277 }
278 if (mTransposedPos < mInputLength - 1) {
279 mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
280 }
281 }
282
283 bool secondTransposing = false;
284 if (mTransposedCount % 2 == 1) {
285 if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
286 ++mTransposedCount;
287 secondTransposing = true;
288 } else if (mCorrectionStates[mOutputIndex].mExceeding) {
289 --mTransposedCount;
290 ++mExcessiveCount;
291 --mExcessivePos;
292 incrementInputIndex();
293 } else {
294 --mTransposedCount;
295 if (DEBUG_CORRECTION) {
296 DUMP_WORD(mWord, mOutputIndex);
297 LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
298 mTransposedCount, mExcessiveCount, c);
299 }
300 return UNRELATED;
301 }
302 }
303
304 // TODO: Change the limit if we'll allow two or more proximity chars with corrections
305 const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0;
306 ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
307 ? ProximityInfo::EQUIVALENT_CHAR
308 : mProximityInfo->getMatchedProximityId(
309 mInputIndex, c, checkProximityChars, &proximityIndex);
310
311 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
312 if (canTryCorrection && mOutputIndex > 0
313 && mCorrectionStates[mOutputIndex].mProximityMatching
314 && mCorrectionStates[mOutputIndex].mExceeding
315 && isEquivalentChar(mProximityInfo->getMatchedProximityId(
316 mInputIndex, mWord[mOutputIndex - 1], false))) {
317 if (DEBUG_CORRECTION) {
318 LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
319 }
320 // Conversion p->e
321 // Example:
322 // wearth -> earth
323 // px -> (E)mmmmm
324 ++mExcessiveCount;
325 --mProximityCount;
326 mExcessivePos = mOutputIndex - 1;
327 ++mInputIndex;
328 // Here, we are doing something equivalent to matchedProximityCharId,
329 // but we already know that "excessive char correction" just happened
330 // so that we just need to check "mProximityCount == 0".
331 matchedProximityCharId = mProximityInfo->getMatchedProximityId(
332 mInputIndex, c, mProximityCount == 0, &proximityIndex);
333 }
334 }
335
336 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
337 // TODO: Optimize
338 // As the current char turned out to be an unrelated char,
339 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
340 // here refers to the previous state.
341 if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0
342 && !mCorrectionStates[mOutputIndex].mTransposing
343 && mCorrectionStates[mOutputIndex - 1].mTransposing
344 && isEquivalentChar(mProximityInfo->getMatchedProximityId(
345 mInputIndex, mWord[mOutputIndex - 1], false))
346 && isEquivalentChar(
347 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
348 // Conversion t->e
349 // Example:
350 // occaisional -> occa sional
351 // mmmmttx -> mmmm(E)mmmmmm
352 mTransposedCount -= 2;
353 ++mExcessiveCount;
354 ++mInputIndex;
355 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
356 && !mCorrectionStates[mOutputIndex].mTransposing
357 && mCorrectionStates[mOutputIndex - 1].mTransposing
358 && isEquivalentChar(
359 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
360 // Conversion t->s
361 // Example:
362 // chcolate -> chocolate
363 // mmttx -> mmsmmmmmm
364 mTransposedCount -= 2;
365 ++mSkippedCount;
366 --mInputIndex;
367 } else if (canTryCorrection && mInputIndex > 0
368 && mCorrectionStates[mOutputIndex].mProximityMatching
369 && mCorrectionStates[mOutputIndex].mSkipping
370 && isEquivalentChar(
371 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
372 // Conversion p->s
373 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
374 // proximity chars of "s", but it should rather be handled as a skipped char.
375 ++mSkippedCount;
376 --mProximityCount;
377 return processSkipChar(c, isTerminal, false);
378 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength
379 && isEquivalentChar(
380 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
381 // 1.2. Excessive or transpose correction
382 if (mTransposing) {
383 ++mTransposedCount;
384 } else {
385 ++mExcessiveCount;
386 incrementInputIndex();
387 }
388 } else if (mSkipping) {
389 // 3. Skip correction
390 ++mSkippedCount;
391 return processSkipChar(c, isTerminal, false);
392 } else {
393 if (DEBUG_CORRECTION) {
394 DUMP_WORD(mWord, mOutputIndex);
395 LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
396 mTransposedCount, mExcessiveCount, c);
397 }
398 return UNRELATED;
399 }
400 } else if (secondTransposing) {
401 // If inputIndex is greater than mInputLength, that means there is no
402 // proximity chars. So, we don't need to check proximity.
403 mMatching = true;
404 } else if (isEquivalentChar(matchedProximityCharId)) {
405 mMatching = true;
406 ++mEquivalentCharCount;
407 mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
408 } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
409 mProximityMatching = true;
410 ++mProximityCount;
411 mDistances[mOutputIndex] =
412 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
413 }
414
415 mWord[mOutputIndex] = c;
416
417 // 4. Last char excessive correction
418 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
419 && mProximityCount == 0 && (mInputIndex == mInputLength - 2);
420 const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded;
421 if (mLastCharExceeded) {
422 ++mExcessiveCount;
423 }
424
425 // Start traversing all nodes after the index exceeds the user typed length
426 if (isSameAsUserTypedLength) {
427 startToTraverseAllNodes();
428 }
429
430 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
431 mExceeding && mInputIndex == mInputLength - 2;
432
433 // Finally, we are ready to go to the next character, the next "virtual node".
434 // We should advance the input index.
435 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
436 // characters to input; the other branch is not matching them but searching for
437 // completions, this is why it does not have to do it.
438 incrementInputIndex();
439 // Also, the next char is one "virtual node" depth more than this char.
440 incrementOutputIndex();
441
442 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
443 || isSameAsUserTypedLength) && isTerminal) {
444 mTerminalInputIndex = mInputIndex - 1;
445 mTerminalOutputIndex = mOutputIndex - 1;
446 if (DEBUG_CORRECTION) {
447 DUMP_WORD(mWord, mOutputIndex);
448 LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
449 mTransposedCount, mExcessiveCount, c);
450 }
451 return ON_TERMINAL;
452 } else {
453 return NOT_ON_TERMINAL;
454 }
455 }
456
~Correction()457 Correction::~Correction() {
458 }
459
460 /////////////////////////
461 // static inline utils //
462 /////////////////////////
463
464 static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num)465 static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
466 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
467 }
468
469 static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
multiplyIntCapped(const int multiplier,int * base)470 inline static void multiplyIntCapped(const int multiplier, int *base) {
471 const int temp = *base;
472 if (temp != S_INT_MAX) {
473 // Branch if multiplier == 2 for the optimization
474 if (multiplier == 2) {
475 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
476 } else {
477 // TODO: This overflow check gives a wrong answer when, for example,
478 // temp = 2^16 + 1 and multiplier = 2^17 + 1.
479 // Fix this behavior.
480 const int tempRetval = temp * multiplier;
481 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
482 }
483 }
484 }
485
powerIntCapped(const int base,const int n)486 inline static int powerIntCapped(const int base, const int n) {
487 if (n <= 0) return 1;
488 if (base == 2) {
489 return n < 31 ? 1 << n : S_INT_MAX;
490 } else {
491 int ret = base;
492 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
493 return ret;
494 }
495 }
496
multiplyRate(const int rate,int * freq)497 inline static void multiplyRate(const int rate, int *freq) {
498 if (*freq != S_INT_MAX) {
499 if (*freq > 1000000) {
500 *freq /= 100;
501 multiplyIntCapped(rate, freq);
502 } else {
503 multiplyIntCapped(rate, freq);
504 *freq /= 100;
505 }
506 }
507 }
508
getQuoteCount(const unsigned short * word,const int length)509 inline static int getQuoteCount(const unsigned short* word, const int length) {
510 int quoteCount = 0;
511 for (int i = 0; i < length; ++i) {
512 if(word[i] == '\'') {
513 ++quoteCount;
514 }
515 }
516 return quoteCount;
517 }
518
isUpperCase(unsigned short c)519 inline static bool isUpperCase(unsigned short c) {
520 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
521 c = BASE_CHARS[c];
522 }
523 if (isupper(c)) {
524 return true;
525 }
526 return false;
527 }
528
529 /* static */
editDistance(int * editDistanceTable,const unsigned short * input,const int inputLength,const unsigned short * output,const int outputLength)530 inline static int editDistance(
531 int* editDistanceTable, const unsigned short* input,
532 const int inputLength, const unsigned short* output, const int outputLength) {
533 // dp[li][lo] dp[a][b] = dp[ a * lo + b]
534 int* dp = editDistanceTable;
535 const int li = inputLength + 1;
536 const int lo = outputLength + 1;
537 for (int i = 0; i < li; ++i) {
538 dp[lo * i] = i;
539 }
540 for (int i = 0; i < lo; ++i) {
541 dp[i] = i;
542 }
543
544 for (int i = 0; i < li - 1; ++i) {
545 for (int j = 0; j < lo - 1; ++j) {
546 const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
547 const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
548 const uint16_t cost = (ci == co) ? 0 : 1;
549 dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
550 min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
551 if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
552 && co == Dictionary::toBaseLowerCase(input[i - 1])) {
553 dp[(i + 1) * lo + (j + 1)] = min(
554 dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
555 }
556 }
557 }
558
559 if (DEBUG_EDIT_DISTANCE) {
560 LOGI("IN = %d, OUT = %d", inputLength, outputLength);
561 for (int i = 0; i < li; ++i) {
562 for (int j = 0; j < lo; ++j) {
563 LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
564 }
565 }
566 }
567 return dp[li * lo - 1];
568 }
569
570 //////////////////////
571 // RankingAlgorithm //
572 //////////////////////
573
574 /* static */
calculateFinalFreq(const int inputIndex,const int outputIndex,const int freq,int * editDistanceTable,const Correction * correction)575 int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
576 const int freq, int* editDistanceTable, const Correction* correction) {
577 const int excessivePos = correction->getExcessivePos();
578 const int inputLength = correction->mInputLength;
579 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
580 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
581 const ProximityInfo *proximityInfo = correction->mProximityInfo;
582 const int skippedCount = correction->mSkippedCount;
583 const int transposedCount = correction->mTransposedCount / 2;
584 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
585 const int proximityMatchedCount = correction->mProximityCount;
586 const bool lastCharExceeded = correction->mLastCharExceeded;
587 const bool useFullEditDistance = correction->mUseFullEditDistance;
588 const int outputLength = outputIndex + 1;
589 if (skippedCount >= inputLength || inputLength == 0) {
590 return -1;
591 }
592
593 // TODO: find more robust way
594 bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2)
595 : (inputLength == inputIndex + 1);
596
597 // TODO: use mExcessiveCount
598 const int matchCount = inputLength - correction->mProximityCount - excessiveCount;
599
600 const unsigned short* word = correction->mWord;
601 const bool skipped = skippedCount > 0;
602
603 const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
604 - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
605
606 // TODO: Calculate edit distance for transposed and excessive
607 int ed = 0;
608 int adjustedProximityMatchedCount = proximityMatchedCount;
609
610 int finalFreq = freq;
611
612 // TODO: Optimize this.
613 // TODO: Ignoring edit distance for transposed char, for now
614 if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) {
615 const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
616 ed = editDistance(editDistanceTable, primaryInputWord,
617 inputLength, word, outputIndex + 1);
618 const int matchWeight = powerIntCapped(typedLetterMultiplier,
619 max(inputLength, outputIndex + 1) - ed);
620 multiplyIntCapped(matchWeight, &finalFreq);
621
622 // TODO: Demote further if there are two or more excessive chars with longer user input?
623 if (inputLength > outputIndex + 1) {
624 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
625 }
626
627 ed = max(0, ed - quoteDiffCount);
628
629 if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) {
630 // Promote a word with just one skipped or excessive char
631 if (sameLength) {
632 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq);
633 } else {
634 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
635 }
636 } else if (ed == 0) {
637 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
638 sameLength = true;
639 }
640 adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
641 proximityMatchedCount);
642 } else {
643 // TODO: Calculate the edit distance for transposed char
644 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
645 multiplyIntCapped(matchWeight, &finalFreq);
646 }
647
648 if (proximityInfo->getMatchedProximityId(0, word[0], true)
649 == ProximityInfo::UNRELATED_CHAR) {
650 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
651 }
652
653 ///////////////////////////////////////////////
654 // Promotion and Demotion for each correction
655
656 // Demotion for a word with missing character
657 if (skipped) {
658 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
659 * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
660 / (10 * inputLength
661 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
662 if (DEBUG_DICT_FULL) {
663 LOGI("Demotion rate for missing character is %d.", demotionRate);
664 }
665 multiplyRate(demotionRate, &finalFreq);
666 }
667
668 // Demotion for a word with transposed character
669 if (transposedCount > 0) multiplyRate(
670 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
671
672 // Demotion for a word with excessive character
673 if (excessiveCount > 0) {
674 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
675 if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) {
676 if (DEBUG_CORRECTION_FREQ) {
677 LOGI("Double excessive demotion");
678 }
679 // If an excessive character is not adjacent to the left char or the right char,
680 // we will demote this word.
681 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
682 }
683 }
684
685 // Score calibration by touch coordinates is being done only for pure-fat finger typing error
686 // cases.
687 // TODO: Remove this constraint.
688 if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled()
689 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) {
690 for (int i = 0; i < outputLength; ++i) {
691 const int squaredDistance = correction->mDistances[i];
692 if (i < adjustedProximityMatchedCount) {
693 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
694 }
695 if (squaredDistance >= 0) {
696 // Promote or demote the score according to the distance from the sweet spot
697 static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f;
698 static const float B = 1.0f;
699 static const float C = 0.5f;
700 static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS;
701 static const float R2 = HALF_SCORE_SQUARED_RADIUS;
702 const float x = (float)squaredDistance
703 / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
704 const float factor = (x < R1)
705 ? (A * (R1 - x) + B * x) / R1
706 : (B * (R2 - x) + C * (x - R1)) / (R2 - R1);
707 // factor is piecewise linear function like:
708 // A -_ .
709 // ^-_ .
710 // B \ .
711 // \ .
712 // C \ .
713 // 0 R1 R2
714 multiplyRate((int)(factor * 100), &finalFreq);
715 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
716 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
717 }
718 }
719 } else {
720 // Promotion for a word with proximity characters
721 for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
722 // A word with proximity corrections
723 if (DEBUG_DICT_FULL) {
724 LOGI("Found a proximity correction.");
725 }
726 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
727 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
728 }
729 }
730
731 const int errorCount = adjustedProximityMatchedCount > 0
732 ? adjustedProximityMatchedCount
733 : (proximityMatchedCount + transposedCount);
734 multiplyRate(
735 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
736
737 // Promotion for an exactly matched word
738 if (ed == 0) {
739 // Full exact match
740 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) {
741 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
742 }
743 }
744
745 // Promote a word with no correction
746 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) {
747 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
748 }
749
750 // TODO: Check excessive count and transposed count
751 // TODO: Remove this if possible
752 /*
753 If the last character of the user input word is the same as the next character
754 of the output word, and also all of characters of the user input are matched
755 to the output word, we'll promote that word a bit because
756 that word can be considered the combination of skipped and matched characters.
757 This means that the 'sm' pattern wins over the 'ma' pattern.
758 e.g.)
759 shel -> shell [mmmma] or [mmmsm]
760 hel -> hello [mmmaa] or [mmsma]
761 m ... matching
762 s ... skipping
763 a ... traversing all
764 t ... transposing
765 e ... exceeding
766 p ... proximity matching
767 */
768 if (matchCount == inputLength && matchCount >= 2 && !skipped
769 && word[matchCount] == word[matchCount - 1]) {
770 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
771 }
772
773 // TODO: Do not use sameLength?
774 if (sameLength) {
775 multiplyIntCapped(fullWordMultiplier, &finalFreq);
776 }
777
778 if (useFullEditDistance && outputLength > inputLength + 1) {
779 const int diff = outputLength - inputLength - 1;
780 const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
781 finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
782 }
783
784 if (DEBUG_DICT_FULL) {
785 LOGI("calc: %d, %d", outputIndex, sameLength);
786 }
787
788 if (DEBUG_CORRECTION_FREQ) {
789 DUMP_WORD(correction->mWord, outputIndex + 1);
790 LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount,
791 skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength,
792 quoteDiffCount, ed, finalFreq);
793 }
794
795 return finalFreq;
796 }
797
798 /* static */
calcFreqForSplitTwoWords(const int firstFreq,const int secondFreq,const Correction * correction,const unsigned short * word)799 int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
800 const int firstFreq, const int secondFreq, const Correction* correction,
801 const unsigned short *word) {
802 const int spaceProximityPos = correction->mSpaceProximityPos;
803 const int missingSpacePos = correction->mMissingSpacePos;
804 if (DEBUG_DICT) {
805 int inputCount = 0;
806 if (spaceProximityPos >= 0) ++inputCount;
807 if (missingSpacePos >= 0) ++inputCount;
808 assert(inputCount <= 1);
809 }
810 const bool isSpaceProximity = spaceProximityPos >= 0;
811 const int inputLength = correction->mInputLength;
812 const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
813 const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
814 : (inputLength - missingSpacePos);
815 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
816
817 bool firstCapitalizedWordDemotion = false;
818 if (firstWordLength >= 2) {
819 firstCapitalizedWordDemotion = isUpperCase(word[0]);
820 }
821
822 bool secondCapitalizedWordDemotion = false;
823 if (secondWordLength >= 2) {
824 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
825 }
826
827 const bool capitalizedWordDemotion =
828 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
829
830 if (DEBUG_DICT_FULL) {
831 LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
832 }
833
834 if (firstWordLength == 0 || secondWordLength == 0) {
835 return 0;
836 }
837 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
838 int tempFirstFreq = firstFreq;
839 multiplyRate(firstDemotionRate, &tempFirstFreq);
840
841 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
842 int tempSecondFreq = secondFreq;
843 multiplyRate(secondDemotionRate, &tempSecondFreq);
844
845 const int totalLength = firstWordLength + secondWordLength;
846
847 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
848 // length.
849 int totalFreq = tempFirstFreq + tempSecondFreq;
850
851 // This is a workaround to try offsetting the not-enough-demotion which will be done in
852 // calcNormalizedScore in Utils.java.
853 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
854 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
855 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
856 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
857 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
858
859 // At this moment, totalFreq is calculated by the following formula:
860 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
861 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
862
863 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
864
865 // This is another workaround to offset the demotion which will be done in
866 // calcNormalizedScore in Utils.java.
867 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
868 // the same amount because we already have adjusted the synthetic freq of this "missing or
869 // mistyped space" suggestion candidate above in this method.
870 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
871 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
872
873 if (isSpaceProximity) {
874 // A word pair with one space proximity correction
875 if (DEBUG_DICT) {
876 LOGI("Found a word pair with space proximity correction.");
877 }
878 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
879 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
880 }
881
882 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
883
884 if (capitalizedWordDemotion) {
885 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
886 }
887
888 return totalFreq;
889 }
890
891 } // namespace latinime
892