1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 19 import com.android.providers.contacts.util.Hex; 20 21 import android.util.Log; 22 23 import java.util.ArrayList; 24 import java.util.Collections; 25 import java.util.HashMap; 26 import java.util.List; 27 28 /** 29 * Logic for matching contacts' data and accumulating match scores. 30 */ 31 public class ContactMatcher { 32 private static final String TAG = "ContactMatcher"; 33 34 // Best possible match score 35 public static final int MAX_SCORE = 100; 36 37 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 38 public static final int SCORE_THRESHOLD_SUGGEST = 50; 39 40 // Automatically aggregate contacts if their match score is equal or greater than this threshold 41 public static final int SCORE_THRESHOLD_PRIMARY = 70; 42 43 // Automatically aggregate contacts if the match score is equal or greater than this threshold 44 // and there is a secondary match (phone number, email etc). 45 public static final int SCORE_THRESHOLD_SECONDARY = 50; 46 47 // Score for missing data (as opposed to present data but a bad match) 48 private static final int NO_DATA_SCORE = -1; 49 50 // Score for matching phone numbers 51 private static final int PHONE_MATCH_SCORE = 71; 52 53 // Score for matching email addresses 54 private static final int EMAIL_MATCH_SCORE = 71; 55 56 // Score for matching nickname 57 private static final int NICKNAME_MATCH_SCORE = 71; 58 59 // Maximum number of characters in a name to be considered by the matching algorithm. 60 private static final int MAX_MATCHED_NAME_LENGTH = 30; 61 62 // Scores a multiplied by this number to allow room for "fractional" scores 63 private static final int SCORE_SCALE = 1000; 64 65 public static final int MATCHING_ALGORITHM_EXACT = 0; 66 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 67 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 68 69 // Minimum edit distance between two names to be considered an approximate match 70 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 71 72 // Minimum edit distance between two email ids to be considered an approximate match 73 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 74 75 // Returned value when we found multiple matches and that was not allowed 76 public static final long MULTIPLE_MATCHES = -2; 77 78 /** 79 * Name matching scores: a matrix by name type vs. candidate lookup type. 80 * For example, if the name type is "full name" while we are looking for a 81 * "full name", the score may be 99. If we are looking for a "nickname" but 82 * find "first name", the score may be 50 (see specific scores defined 83 * below.) 84 * <p> 85 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 86 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 87 * match producing the score of 70. The score may also be 0 if the similarity (distance) 88 * between the strings is below the threshold. 89 * <p> 90 * We use a string matching algorithm, which is particularly suited for 91 * name matching. See {@link NameDistance}. 92 */ 93 private static int[] sMinScore = 94 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 95 private static int[] sMaxScore = 96 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 97 98 /* 99 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 100 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 101 * not! They are useful in three-way aggregation cases when we have, for example, both 102 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 103 * with the former rather than the latter. This is why "reverse" matches have slightly lower 104 * scores than direct matches. 105 */ 106 static { setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)107 setScoreRange(NameLookupType.NAME_EXACT, 108 NameLookupType.NAME_EXACT, 99, 99); setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)109 setScoreRange(NameLookupType.NAME_VARIANT, 110 NameLookupType.NAME_VARIANT, 90, 90); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)111 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 112 NameLookupType.NAME_COLLATION_KEY, 50, 80); 113 setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)114 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 115 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)116 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 117 NameLookupType.NICKNAME, 50, 60); 118 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)119 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 120 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)121 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 122 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)123 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 124 NameLookupType.NICKNAME, 50, 60); 125 setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)126 setScoreRange(NameLookupType.NICKNAME, 127 NameLookupType.NICKNAME, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)128 setScoreRange(NameLookupType.NICKNAME, 129 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)130 setScoreRange(NameLookupType.NICKNAME, 131 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 132 } 133 134 /** 135 * Populates the cells of the score matrix and score span matrix 136 * corresponding to the {@code candidateNameType} and {@code nameType}. 137 */ setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)138 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { 139 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 140 sMinScore[index] = scoreFrom; 141 sMaxScore[index] = scoreTo; 142 } 143 144 /** 145 * Returns the lower range for the match score for the given {@code candidateNameType} and 146 * {@code nameType}. 147 */ getMinScore(int candidateNameType, int nameType)148 private static int getMinScore(int candidateNameType, int nameType) { 149 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 150 return sMinScore[index]; 151 } 152 153 /** 154 * Returns the upper range for the match score for the given {@code candidateNameType} and 155 * {@code nameType}. 156 */ getMaxScore(int candidateNameType, int nameType)157 private static int getMaxScore(int candidateNameType, int nameType) { 158 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 159 return sMaxScore[index]; 160 } 161 162 /** 163 * Captures the max score and match count for a specific contact. Used in an 164 * contactId - MatchScore map. 165 */ 166 public static class MatchScore implements Comparable<MatchScore> { 167 private long mContactId; 168 private boolean mKeepIn; 169 private boolean mKeepOut; 170 private int mPrimaryScore; 171 private int mSecondaryScore; 172 private int mMatchCount; 173 MatchScore(long contactId)174 public MatchScore(long contactId) { 175 this.mContactId = contactId; 176 } 177 reset(long contactId)178 public void reset(long contactId) { 179 this.mContactId = contactId; 180 mKeepIn = false; 181 mKeepOut = false; 182 mPrimaryScore = 0; 183 mSecondaryScore = 0; 184 mMatchCount = 0; 185 } 186 getContactId()187 public long getContactId() { 188 return mContactId; 189 } 190 updatePrimaryScore(int score)191 public void updatePrimaryScore(int score) { 192 if (score > mPrimaryScore) { 193 mPrimaryScore = score; 194 } 195 mMatchCount++; 196 } 197 updateSecondaryScore(int score)198 public void updateSecondaryScore(int score) { 199 if (score > mSecondaryScore) { 200 mSecondaryScore = score; 201 } 202 mMatchCount++; 203 } 204 keepIn()205 public void keepIn() { 206 mKeepIn = true; 207 } 208 keepOut()209 public void keepOut() { 210 mKeepOut = true; 211 } 212 getScore()213 public int getScore() { 214 if (mKeepOut) { 215 return 0; 216 } 217 218 if (mKeepIn) { 219 return MAX_SCORE; 220 } 221 222 int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); 223 224 // Ensure that of two contacts with the same match score the one with more matching 225 // data elements wins. 226 return score * SCORE_SCALE + mMatchCount; 227 } 228 229 /** 230 * Descending order of match score. 231 */ 232 @Override compareTo(MatchScore another)233 public int compareTo(MatchScore another) { 234 return another.getScore() - getScore(); 235 } 236 237 @Override toString()238 public String toString() { 239 return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount 240 + ")"; 241 } 242 } 243 244 private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); 245 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 246 private int mScoreCount = 0; 247 248 private final NameDistance mNameDistanceConservative = new NameDistance(); 249 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 250 getMatchingScore(long contactId)251 private MatchScore getMatchingScore(long contactId) { 252 MatchScore matchingScore = mScores.get(contactId); 253 if (matchingScore == null) { 254 if (mScoreList.size() > mScoreCount) { 255 matchingScore = mScoreList.get(mScoreCount); 256 matchingScore.reset(contactId); 257 } else { 258 matchingScore = new MatchScore(contactId); 259 mScoreList.add(matchingScore); 260 } 261 mScoreCount++; 262 mScores.put(contactId, matchingScore); 263 } 264 return matchingScore; 265 } 266 267 /** 268 * Marks the contact as a full match, because we found an Identity match 269 */ matchIdentity(long contactId)270 public void matchIdentity(long contactId) { 271 updatePrimaryScore(contactId, MAX_SCORE); 272 } 273 274 /** 275 * Checks if there is a match and updates the overall score for the 276 * specified contact for a discovered match. The new score is determined 277 * by the prior score, by the type of name we were looking for, the type 278 * of name we found and, if the match is approximate, the distance between the candidate and 279 * actual name. 280 */ matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)281 public void matchName(long contactId, int candidateNameType, String candidateName, 282 int nameType, String name, int algorithm) { 283 int maxScore = getMaxScore(candidateNameType, nameType); 284 if (maxScore == 0) { 285 return; 286 } 287 288 if (candidateName.equals(name)) { 289 updatePrimaryScore(contactId, maxScore); 290 return; 291 } 292 293 if (algorithm == MATCHING_ALGORITHM_EXACT) { 294 return; 295 } 296 297 int minScore = getMinScore(candidateNameType, nameType); 298 if (minScore == maxScore) { 299 return; 300 } 301 302 final byte[] decodedCandidateName; 303 final byte[] decodedName; 304 try { 305 decodedCandidateName = Hex.decodeHex(candidateName); 306 decodedName = Hex.decodeHex(name); 307 } catch (RuntimeException e) { 308 // How could this happen?? See bug 6827136 309 Log.e(TAG, "Failed to decode normalized name. Skipping.", e); 310 return; 311 } 312 313 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 314 mNameDistanceConservative : mNameDistanceApproximate; 315 316 int score; 317 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 318 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 319 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 320 float threshold = emailBased 321 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 322 : APPROXIMATE_MATCH_THRESHOLD; 323 if (distance > threshold) { 324 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 325 } else { 326 score = 0; 327 } 328 329 updatePrimaryScore(contactId, score); 330 } 331 updateScoreWithPhoneNumberMatch(long contactId)332 public void updateScoreWithPhoneNumberMatch(long contactId) { 333 updateSecondaryScore(contactId, PHONE_MATCH_SCORE); 334 } 335 updateScoreWithEmailMatch(long contactId)336 public void updateScoreWithEmailMatch(long contactId) { 337 updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); 338 } 339 updateScoreWithNicknameMatch(long contactId)340 public void updateScoreWithNicknameMatch(long contactId) { 341 updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); 342 } 343 updatePrimaryScore(long contactId, int score)344 private void updatePrimaryScore(long contactId, int score) { 345 getMatchingScore(contactId).updatePrimaryScore(score); 346 } 347 updateSecondaryScore(long contactId, int score)348 private void updateSecondaryScore(long contactId, int score) { 349 getMatchingScore(contactId).updateSecondaryScore(score); 350 } 351 keepIn(long contactId)352 public void keepIn(long contactId) { 353 getMatchingScore(contactId).keepIn(); 354 } 355 keepOut(long contactId)356 public void keepOut(long contactId) { 357 getMatchingScore(contactId).keepOut(); 358 } 359 clear()360 public void clear() { 361 mScores.clear(); 362 mScoreCount = 0; 363 } 364 365 /** 366 * Returns a list of IDs for contacts that are matched on secondary data elements 367 * (phone number, email address, nickname). We still need to obtain the approximate 368 * primary score for those contacts to determine if any of them should be aggregated. 369 * <p> 370 * May return null. 371 */ prepareSecondaryMatchCandidates(int threshold)372 public List<Long> prepareSecondaryMatchCandidates(int threshold) { 373 ArrayList<Long> contactIds = null; 374 375 for (int i = 0; i < mScoreCount; i++) { 376 MatchScore score = mScoreList.get(i); 377 if (score.mKeepOut) { 378 continue; 379 } 380 381 int s = score.mSecondaryScore; 382 if (s >= threshold) { 383 if (contactIds == null) { 384 contactIds = new ArrayList<Long>(); 385 } 386 contactIds.add(score.mContactId); 387 } 388 score.mPrimaryScore = NO_DATA_SCORE; 389 } 390 return contactIds; 391 } 392 393 /** 394 * Returns the contactId with the best match score over the specified threshold or -1 395 * if no such contact is found. If multiple contacts are found, and 396 * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if 397 * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}. 398 */ pickBestMatch(int threshold, boolean allowMultipleMatches)399 public long pickBestMatch(int threshold, boolean allowMultipleMatches) { 400 long contactId = -1; 401 int maxScore = 0; 402 for (int i = 0; i < mScoreCount; i++) { 403 MatchScore score = mScoreList.get(i); 404 if (score.mKeepOut) { 405 continue; 406 } 407 408 if (score.mKeepIn) { 409 return score.mContactId; 410 } 411 412 int s = score.mPrimaryScore; 413 if (s == NO_DATA_SCORE) { 414 s = score.mSecondaryScore; 415 } 416 417 if (s >= threshold) { 418 if (contactId != -1 && !allowMultipleMatches) { 419 return MULTIPLE_MATCHES; 420 } 421 // In order to make it stable, let's jut pick the one with the lowest ID 422 // if multiple candidates are found. 423 if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) { 424 contactId = score.mContactId; 425 maxScore = s; 426 } 427 } 428 } 429 return contactId; 430 } 431 432 /** 433 * Returns matches in the order of descending score. 434 */ pickBestMatches(int threshold)435 public List<MatchScore> pickBestMatches(int threshold) { 436 int scaledThreshold = threshold * SCORE_SCALE; 437 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 438 Collections.sort(matches); 439 int count = 0; 440 for (int i = 0; i < mScoreCount; i++) { 441 MatchScore matchScore = matches.get(i); 442 if (matchScore.getScore() >= scaledThreshold) { 443 count++; 444 } else { 445 break; 446 } 447 } 448 449 return matches.subList(0, count); 450 } 451 452 @Override toString()453 public String toString() { 454 return mScoreList.subList(0, mScoreCount).toString(); 455 } 456 } 457