• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
19 import com.android.providers.contacts.util.Hex;
20 
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.HashMap;
24 import java.util.List;
25 
26 /**
27  * Logic for matching contacts' data and accumulating match scores.
28  */
29 public class ContactMatcher {
30 
31     // Best possible match score
32     public static final int MAX_SCORE = 100;
33 
34     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
35     public static final int SCORE_THRESHOLD_SUGGEST = 50;
36 
37     // Automatically aggregate contacts if their match score is equal or greater than this threshold
38     public static final int SCORE_THRESHOLD_PRIMARY = 70;
39 
40     // Automatically aggregate contacts if the match score is equal or greater than this threshold
41     // and there is a secondary match (phone number, email etc).
42     public static final int SCORE_THRESHOLD_SECONDARY = 50;
43 
44     // Score for missing data (as opposed to present data but a bad match)
45     private static final int NO_DATA_SCORE = -1;
46 
47     // Score for matching phone numbers
48     private static final int PHONE_MATCH_SCORE = 71;
49 
50     // Score for matching email addresses
51     private static final int EMAIL_MATCH_SCORE = 71;
52 
53     // Score for matching nickname
54     private static final int NICKNAME_MATCH_SCORE = 71;
55 
56     // Maximum number of characters in a name to be considered by the matching algorithm.
57     private static final int MAX_MATCHED_NAME_LENGTH = 30;
58 
59     // Scores a multiplied by this number to allow room for "fractional" scores
60     private static final int SCORE_SCALE = 1000;
61 
62     public static final int MATCHING_ALGORITHM_EXACT = 0;
63     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
64     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
65 
66     // Minimum edit distance between two names to be considered an approximate match
67     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
68 
69     // Minimum edit distance between two email ids to be considered an approximate match
70     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
71 
72     // Returned value when we found multiple matches and that was not allowed
73     public static final long MULTIPLE_MATCHES = -2;
74 
75     /**
76      * Name matching scores: a matrix by name type vs. candidate lookup type.
77      * For example, if the name type is "full name" while we are looking for a
78      * "full name", the score may be 99. If we are looking for a "nickname" but
79      * find "first name", the score may be 50 (see specific scores defined
80      * below.)
81      * <p>
82      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
83      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
84      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
85      * between the strings is below the threshold.
86      * <p>
87      * We use a string matching algorithm, which is particularly suited for
88      * name matching. See {@link NameDistance}.
89      */
90     private static int[] sMinScore =
91             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
92     private static int[] sMaxScore =
93             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
94 
95     /*
96      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
97      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
98      * not!  They are useful in three-way aggregation cases when we have, for example, both
99      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
100      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
101      * scores than direct matches.
102      */
103     static {
setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)104         setScoreRange(NameLookupType.NAME_EXACT,
105                 NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)106         setScoreRange(NameLookupType.NAME_VARIANT,
107                 NameLookupType.NAME_VARIANT, 90, 90);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)108         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
109                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
110 
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)111         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
112                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)113         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
114                 NameLookupType.NICKNAME, 50, 60);
115 
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)116         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
117                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)118         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
119                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)120         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
121                 NameLookupType.NICKNAME, 50, 60);
122 
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)123         setScoreRange(NameLookupType.NICKNAME,
124                 NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)125         setScoreRange(NameLookupType.NICKNAME,
126                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)127         setScoreRange(NameLookupType.NICKNAME,
128                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
129     }
130 
131     /**
132      * Populates the cells of the score matrix and score span matrix
133      * corresponding to the {@code candidateNameType} and {@code nameType}.
134      */
setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)135     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
136         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
137         sMinScore[index] = scoreFrom;
138         sMaxScore[index] = scoreTo;
139     }
140 
141     /**
142      * Returns the lower range for the match score for the given {@code candidateNameType} and
143      * {@code nameType}.
144      */
getMinScore(int candidateNameType, int nameType)145     private static int getMinScore(int candidateNameType, int nameType) {
146         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
147         return sMinScore[index];
148     }
149 
150     /**
151      * Returns the upper range for the match score for the given {@code candidateNameType} and
152      * {@code nameType}.
153      */
getMaxScore(int candidateNameType, int nameType)154     private static int getMaxScore(int candidateNameType, int nameType) {
155         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
156         return sMaxScore[index];
157     }
158 
159     /**
160      * Captures the max score and match count for a specific contact.  Used in an
161      * contactId - MatchScore map.
162      */
163     public static class MatchScore implements Comparable<MatchScore> {
164         private long mContactId;
165         private boolean mKeepIn;
166         private boolean mKeepOut;
167         private int mPrimaryScore;
168         private int mSecondaryScore;
169         private int mMatchCount;
170 
MatchScore(long contactId)171         public MatchScore(long contactId) {
172             this.mContactId = contactId;
173         }
174 
reset(long contactId)175         public void reset(long contactId) {
176             this.mContactId = contactId;
177             mKeepIn = false;
178             mKeepOut = false;
179             mPrimaryScore = 0;
180             mSecondaryScore = 0;
181             mMatchCount = 0;
182         }
183 
getContactId()184         public long getContactId() {
185             return mContactId;
186         }
187 
updatePrimaryScore(int score)188         public void updatePrimaryScore(int score) {
189             if (score > mPrimaryScore) {
190                 mPrimaryScore = score;
191             }
192             mMatchCount++;
193         }
194 
updateSecondaryScore(int score)195         public void updateSecondaryScore(int score) {
196             if (score > mSecondaryScore) {
197                 mSecondaryScore = score;
198             }
199             mMatchCount++;
200         }
201 
keepIn()202         public void keepIn() {
203             mKeepIn = true;
204         }
205 
keepOut()206         public void keepOut() {
207             mKeepOut = true;
208         }
209 
getScore()210         public int getScore() {
211             if (mKeepOut) {
212                 return 0;
213             }
214 
215             if (mKeepIn) {
216                 return MAX_SCORE;
217             }
218 
219             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
220 
221             // Ensure that of two contacts with the same match score the one with more matching
222             // data elements wins.
223             return score * SCORE_SCALE + mMatchCount;
224         }
225 
226         /**
227          * Descending order of match score.
228          */
229         @Override
compareTo(MatchScore another)230         public int compareTo(MatchScore another) {
231             return another.getScore() - getScore();
232         }
233 
234         @Override
toString()235         public String toString() {
236             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
237                     + ")";
238         }
239     }
240 
241     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
242     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
243     private int mScoreCount = 0;
244 
245     private final NameDistance mNameDistanceConservative = new NameDistance();
246     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
247 
getMatchingScore(long contactId)248     private MatchScore getMatchingScore(long contactId) {
249         MatchScore matchingScore = mScores.get(contactId);
250         if (matchingScore == null) {
251             if (mScoreList.size() > mScoreCount) {
252                 matchingScore = mScoreList.get(mScoreCount);
253                 matchingScore.reset(contactId);
254             } else {
255                 matchingScore = new MatchScore(contactId);
256                 mScoreList.add(matchingScore);
257             }
258             mScoreCount++;
259             mScores.put(contactId, matchingScore);
260         }
261         return matchingScore;
262     }
263 
264     /**
265      * Marks the contact as a full match, because we found an Identity match
266      */
matchIdentity(long contactId)267     public void matchIdentity(long contactId) {
268         updatePrimaryScore(contactId, MAX_SCORE);
269     }
270 
271     /**
272      * Checks if there is a match and updates the overall score for the
273      * specified contact for a discovered match. The new score is determined
274      * by the prior score, by the type of name we were looking for, the type
275      * of name we found and, if the match is approximate, the distance between the candidate and
276      * actual name.
277      */
matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)278     public void matchName(long contactId, int candidateNameType, String candidateName,
279             int nameType, String name, int algorithm) {
280         int maxScore = getMaxScore(candidateNameType, nameType);
281         if (maxScore == 0) {
282             return;
283         }
284 
285         if (candidateName.equals(name)) {
286             updatePrimaryScore(contactId, maxScore);
287             return;
288         }
289 
290         if (algorithm == MATCHING_ALGORITHM_EXACT) {
291             return;
292         }
293 
294         int minScore = getMinScore(candidateNameType, nameType);
295         if (minScore == maxScore) {
296             return;
297         }
298 
299         byte[] decodedCandidateName = Hex.decodeHex(candidateName);
300         byte[] decodedName = Hex.decodeHex(name);
301 
302         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
303                 mNameDistanceConservative : mNameDistanceApproximate;
304 
305         int score;
306         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
307         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
308                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
309         float threshold = emailBased
310                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
311                 : APPROXIMATE_MATCH_THRESHOLD;
312         if (distance > threshold) {
313             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
314         } else {
315             score = 0;
316         }
317 
318         updatePrimaryScore(contactId, score);
319     }
320 
updateScoreWithPhoneNumberMatch(long contactId)321     public void updateScoreWithPhoneNumberMatch(long contactId) {
322         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
323     }
324 
updateScoreWithEmailMatch(long contactId)325     public void updateScoreWithEmailMatch(long contactId) {
326         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
327     }
328 
updateScoreWithNicknameMatch(long contactId)329     public void updateScoreWithNicknameMatch(long contactId) {
330         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
331     }
332 
updatePrimaryScore(long contactId, int score)333     private void updatePrimaryScore(long contactId, int score) {
334         getMatchingScore(contactId).updatePrimaryScore(score);
335     }
336 
updateSecondaryScore(long contactId, int score)337     private void updateSecondaryScore(long contactId, int score) {
338         getMatchingScore(contactId).updateSecondaryScore(score);
339     }
340 
keepIn(long contactId)341     public void keepIn(long contactId) {
342         getMatchingScore(contactId).keepIn();
343     }
344 
keepOut(long contactId)345     public void keepOut(long contactId) {
346         getMatchingScore(contactId).keepOut();
347     }
348 
clear()349     public void clear() {
350         mScores.clear();
351         mScoreCount = 0;
352     }
353 
354     /**
355      * Returns a list of IDs for contacts that are matched on secondary data elements
356      * (phone number, email address, nickname). We still need to obtain the approximate
357      * primary score for those contacts to determine if any of them should be aggregated.
358      * <p>
359      * May return null.
360      */
prepareSecondaryMatchCandidates(int threshold)361     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
362         ArrayList<Long> contactIds = null;
363 
364         for (int i = 0; i < mScoreCount; i++) {
365             MatchScore score = mScoreList.get(i);
366             if (score.mKeepOut) {
367                 continue;
368             }
369 
370             int s = score.mSecondaryScore;
371             if (s >= threshold) {
372                 if (contactIds == null) {
373                     contactIds = new ArrayList<Long>();
374                 }
375                 contactIds.add(score.mContactId);
376             }
377             score.mPrimaryScore = NO_DATA_SCORE;
378         }
379         return contactIds;
380     }
381 
382     /**
383      * Returns the contactId with the best match score over the specified threshold or -1
384      * if no such contact is found.  If multiple contacts are found, and
385      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
386      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
387      */
pickBestMatch(int threshold, boolean allowMultipleMatches)388     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
389         long contactId = -1;
390         int maxScore = 0;
391         for (int i = 0; i < mScoreCount; i++) {
392             MatchScore score = mScoreList.get(i);
393             if (score.mKeepOut) {
394                 continue;
395             }
396 
397             if (score.mKeepIn) {
398                 return score.mContactId;
399             }
400 
401             int s = score.mPrimaryScore;
402             if (s == NO_DATA_SCORE) {
403                 s = score.mSecondaryScore;
404             }
405 
406             if (s >= threshold) {
407                 if (contactId != -1 && !allowMultipleMatches) {
408                     return MULTIPLE_MATCHES;
409                 }
410                 // In order to make it stable, let's jut pick the one with the lowest ID
411                 // if multiple candidates are found.
412                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) {
413                     contactId = score.mContactId;
414                     maxScore = s;
415                 }
416             }
417         }
418         return contactId;
419     }
420 
421     /**
422      * Returns matches in the order of descending score.
423      */
pickBestMatches(int threshold)424     public List<MatchScore> pickBestMatches(int threshold) {
425         int scaledThreshold = threshold * SCORE_SCALE;
426         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
427         Collections.sort(matches);
428         int count = 0;
429         for (int i = 0; i < mScoreCount; i++) {
430             MatchScore matchScore = matches.get(i);
431             if (matchScore.getScore() >= scaledThreshold) {
432                 count++;
433             } else {
434                 break;
435             }
436         }
437 
438         return matches.subList(0, count);
439     }
440 
441     @Override
toString()442     public String toString() {
443         return mScoreList.subList(0, mScoreCount).toString();
444     }
445 }
446