1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 19 import com.android.providers.contacts.util.Hex; 20 21 import java.util.ArrayList; 22 import java.util.Collections; 23 import java.util.HashMap; 24 import java.util.List; 25 26 /** 27 * Logic for matching contacts' data and accumulating match scores. 28 */ 29 public class ContactMatcher { 30 31 // Best possible match score 32 public static final int MAX_SCORE = 100; 33 34 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 35 public static final int SCORE_THRESHOLD_SUGGEST = 50; 36 37 // Automatically aggregate contacts if their match score is equal or greater than this threshold 38 public static final int SCORE_THRESHOLD_PRIMARY = 70; 39 40 // Automatically aggregate contacts if the match score is equal or greater than this threshold 41 // and there is a secondary match (phone number, email etc). 42 public static final int SCORE_THRESHOLD_SECONDARY = 50; 43 44 // Score for missing data (as opposed to present data but a bad match) 45 private static final int NO_DATA_SCORE = -1; 46 47 // Score for matching phone numbers 48 private static final int PHONE_MATCH_SCORE = 71; 49 50 // Score for matching email addresses 51 private static final int EMAIL_MATCH_SCORE = 71; 52 53 // Score for matching nickname 54 private static final int NICKNAME_MATCH_SCORE = 71; 55 56 // Maximum number of characters in a name to be considered by the matching algorithm. 57 private static final int MAX_MATCHED_NAME_LENGTH = 30; 58 59 // Scores a multiplied by this number to allow room for "fractional" scores 60 private static final int SCORE_SCALE = 1000; 61 62 public static final int MATCHING_ALGORITHM_EXACT = 0; 63 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 64 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 65 66 // Minimum edit distance between two names to be considered an approximate match 67 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 68 69 // Minimum edit distance between two email ids to be considered an approximate match 70 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 71 72 // Returned value when we found multiple matches and that was not allowed 73 public static final long MULTIPLE_MATCHES = -2; 74 75 /** 76 * Name matching scores: a matrix by name type vs. candidate lookup type. 77 * For example, if the name type is "full name" while we are looking for a 78 * "full name", the score may be 99. If we are looking for a "nickname" but 79 * find "first name", the score may be 50 (see specific scores defined 80 * below.) 81 * <p> 82 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 83 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 84 * match producing the score of 70. The score may also be 0 if the similarity (distance) 85 * between the strings is below the threshold. 86 * <p> 87 * We use a string matching algorithm, which is particularly suited for 88 * name matching. See {@link NameDistance}. 89 */ 90 private static int[] sMinScore = 91 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 92 private static int[] sMaxScore = 93 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 94 95 /* 96 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 97 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 98 * not! They are useful in three-way aggregation cases when we have, for example, both 99 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 100 * with the former rather than the latter. This is why "reverse" matches have slightly lower 101 * scores than direct matches. 102 */ 103 static { setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)104 setScoreRange(NameLookupType.NAME_EXACT, 105 NameLookupType.NAME_EXACT, 99, 99); setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)106 setScoreRange(NameLookupType.NAME_VARIANT, 107 NameLookupType.NAME_VARIANT, 90, 90); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)108 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 109 NameLookupType.NAME_COLLATION_KEY, 50, 80); 110 setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)111 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 112 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)113 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 114 NameLookupType.NICKNAME, 50, 60); 115 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)116 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 117 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)118 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 119 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)120 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 121 NameLookupType.NICKNAME, 50, 60); 122 setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)123 setScoreRange(NameLookupType.NICKNAME, 124 NameLookupType.NICKNAME, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)125 setScoreRange(NameLookupType.NICKNAME, 126 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)127 setScoreRange(NameLookupType.NICKNAME, 128 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 129 } 130 131 /** 132 * Populates the cells of the score matrix and score span matrix 133 * corresponding to the {@code candidateNameType} and {@code nameType}. 134 */ setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)135 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { 136 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 137 sMinScore[index] = scoreFrom; 138 sMaxScore[index] = scoreTo; 139 } 140 141 /** 142 * Returns the lower range for the match score for the given {@code candidateNameType} and 143 * {@code nameType}. 144 */ getMinScore(int candidateNameType, int nameType)145 private static int getMinScore(int candidateNameType, int nameType) { 146 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 147 return sMinScore[index]; 148 } 149 150 /** 151 * Returns the upper range for the match score for the given {@code candidateNameType} and 152 * {@code nameType}. 153 */ getMaxScore(int candidateNameType, int nameType)154 private static int getMaxScore(int candidateNameType, int nameType) { 155 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 156 return sMaxScore[index]; 157 } 158 159 /** 160 * Captures the max score and match count for a specific contact. Used in an 161 * contactId - MatchScore map. 162 */ 163 public static class MatchScore implements Comparable<MatchScore> { 164 private long mContactId; 165 private boolean mKeepIn; 166 private boolean mKeepOut; 167 private int mPrimaryScore; 168 private int mSecondaryScore; 169 private int mMatchCount; 170 MatchScore(long contactId)171 public MatchScore(long contactId) { 172 this.mContactId = contactId; 173 } 174 reset(long contactId)175 public void reset(long contactId) { 176 this.mContactId = contactId; 177 mKeepIn = false; 178 mKeepOut = false; 179 mPrimaryScore = 0; 180 mSecondaryScore = 0; 181 mMatchCount = 0; 182 } 183 getContactId()184 public long getContactId() { 185 return mContactId; 186 } 187 updatePrimaryScore(int score)188 public void updatePrimaryScore(int score) { 189 if (score > mPrimaryScore) { 190 mPrimaryScore = score; 191 } 192 mMatchCount++; 193 } 194 updateSecondaryScore(int score)195 public void updateSecondaryScore(int score) { 196 if (score > mSecondaryScore) { 197 mSecondaryScore = score; 198 } 199 mMatchCount++; 200 } 201 keepIn()202 public void keepIn() { 203 mKeepIn = true; 204 } 205 keepOut()206 public void keepOut() { 207 mKeepOut = true; 208 } 209 getScore()210 public int getScore() { 211 if (mKeepOut) { 212 return 0; 213 } 214 215 if (mKeepIn) { 216 return MAX_SCORE; 217 } 218 219 int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); 220 221 // Ensure that of two contacts with the same match score the one with more matching 222 // data elements wins. 223 return score * SCORE_SCALE + mMatchCount; 224 } 225 226 /** 227 * Descending order of match score. 228 */ 229 @Override compareTo(MatchScore another)230 public int compareTo(MatchScore another) { 231 return another.getScore() - getScore(); 232 } 233 234 @Override toString()235 public String toString() { 236 return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount 237 + ")"; 238 } 239 } 240 241 private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); 242 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 243 private int mScoreCount = 0; 244 245 private final NameDistance mNameDistanceConservative = new NameDistance(); 246 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 247 getMatchingScore(long contactId)248 private MatchScore getMatchingScore(long contactId) { 249 MatchScore matchingScore = mScores.get(contactId); 250 if (matchingScore == null) { 251 if (mScoreList.size() > mScoreCount) { 252 matchingScore = mScoreList.get(mScoreCount); 253 matchingScore.reset(contactId); 254 } else { 255 matchingScore = new MatchScore(contactId); 256 mScoreList.add(matchingScore); 257 } 258 mScoreCount++; 259 mScores.put(contactId, matchingScore); 260 } 261 return matchingScore; 262 } 263 264 /** 265 * Marks the contact as a full match, because we found an Identity match 266 */ matchIdentity(long contactId)267 public void matchIdentity(long contactId) { 268 updatePrimaryScore(contactId, MAX_SCORE); 269 } 270 271 /** 272 * Checks if there is a match and updates the overall score for the 273 * specified contact for a discovered match. The new score is determined 274 * by the prior score, by the type of name we were looking for, the type 275 * of name we found and, if the match is approximate, the distance between the candidate and 276 * actual name. 277 */ matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)278 public void matchName(long contactId, int candidateNameType, String candidateName, 279 int nameType, String name, int algorithm) { 280 int maxScore = getMaxScore(candidateNameType, nameType); 281 if (maxScore == 0) { 282 return; 283 } 284 285 if (candidateName.equals(name)) { 286 updatePrimaryScore(contactId, maxScore); 287 return; 288 } 289 290 if (algorithm == MATCHING_ALGORITHM_EXACT) { 291 return; 292 } 293 294 int minScore = getMinScore(candidateNameType, nameType); 295 if (minScore == maxScore) { 296 return; 297 } 298 299 byte[] decodedCandidateName = Hex.decodeHex(candidateName); 300 byte[] decodedName = Hex.decodeHex(name); 301 302 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 303 mNameDistanceConservative : mNameDistanceApproximate; 304 305 int score; 306 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 307 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 308 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 309 float threshold = emailBased 310 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 311 : APPROXIMATE_MATCH_THRESHOLD; 312 if (distance > threshold) { 313 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 314 } else { 315 score = 0; 316 } 317 318 updatePrimaryScore(contactId, score); 319 } 320 updateScoreWithPhoneNumberMatch(long contactId)321 public void updateScoreWithPhoneNumberMatch(long contactId) { 322 updateSecondaryScore(contactId, PHONE_MATCH_SCORE); 323 } 324 updateScoreWithEmailMatch(long contactId)325 public void updateScoreWithEmailMatch(long contactId) { 326 updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); 327 } 328 updateScoreWithNicknameMatch(long contactId)329 public void updateScoreWithNicknameMatch(long contactId) { 330 updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); 331 } 332 updatePrimaryScore(long contactId, int score)333 private void updatePrimaryScore(long contactId, int score) { 334 getMatchingScore(contactId).updatePrimaryScore(score); 335 } 336 updateSecondaryScore(long contactId, int score)337 private void updateSecondaryScore(long contactId, int score) { 338 getMatchingScore(contactId).updateSecondaryScore(score); 339 } 340 keepIn(long contactId)341 public void keepIn(long contactId) { 342 getMatchingScore(contactId).keepIn(); 343 } 344 keepOut(long contactId)345 public void keepOut(long contactId) { 346 getMatchingScore(contactId).keepOut(); 347 } 348 clear()349 public void clear() { 350 mScores.clear(); 351 mScoreCount = 0; 352 } 353 354 /** 355 * Returns a list of IDs for contacts that are matched on secondary data elements 356 * (phone number, email address, nickname). We still need to obtain the approximate 357 * primary score for those contacts to determine if any of them should be aggregated. 358 * <p> 359 * May return null. 360 */ prepareSecondaryMatchCandidates(int threshold)361 public List<Long> prepareSecondaryMatchCandidates(int threshold) { 362 ArrayList<Long> contactIds = null; 363 364 for (int i = 0; i < mScoreCount; i++) { 365 MatchScore score = mScoreList.get(i); 366 if (score.mKeepOut) { 367 continue; 368 } 369 370 int s = score.mSecondaryScore; 371 if (s >= threshold) { 372 if (contactIds == null) { 373 contactIds = new ArrayList<Long>(); 374 } 375 contactIds.add(score.mContactId); 376 } 377 score.mPrimaryScore = NO_DATA_SCORE; 378 } 379 return contactIds; 380 } 381 382 /** 383 * Returns the contactId with the best match score over the specified threshold or -1 384 * if no such contact is found. If multiple contacts are found, and 385 * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if 386 * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}. 387 */ pickBestMatch(int threshold, boolean allowMultipleMatches)388 public long pickBestMatch(int threshold, boolean allowMultipleMatches) { 389 long contactId = -1; 390 int maxScore = 0; 391 for (int i = 0; i < mScoreCount; i++) { 392 MatchScore score = mScoreList.get(i); 393 if (score.mKeepOut) { 394 continue; 395 } 396 397 if (score.mKeepIn) { 398 return score.mContactId; 399 } 400 401 int s = score.mPrimaryScore; 402 if (s == NO_DATA_SCORE) { 403 s = score.mSecondaryScore; 404 } 405 406 if (s >= threshold) { 407 if (contactId != -1 && !allowMultipleMatches) { 408 return MULTIPLE_MATCHES; 409 } 410 // In order to make it stable, let's jut pick the one with the lowest ID 411 // if multiple candidates are found. 412 if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) { 413 contactId = score.mContactId; 414 maxScore = s; 415 } 416 } 417 } 418 return contactId; 419 } 420 421 /** 422 * Returns matches in the order of descending score. 423 */ pickBestMatches(int threshold)424 public List<MatchScore> pickBestMatches(int threshold) { 425 int scaledThreshold = threshold * SCORE_SCALE; 426 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 427 Collections.sort(matches); 428 int count = 0; 429 for (int i = 0; i < mScoreCount; i++) { 430 MatchScore matchScore = matches.get(i); 431 if (matchScore.getScore() >= scaledThreshold) { 432 count++; 433 } else { 434 break; 435 } 436 } 437 438 return matches.subList(0, count); 439 } 440 441 @Override toString()442 public String toString() { 443 return mScoreList.subList(0, mScoreCount).toString(); 444 } 445 } 446