1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 19 import com.android.providers.contacts.util.Hex; 20 21 import android.util.ArrayMap; 22 import android.util.Log; 23 24 import java.util.ArrayList; 25 import java.util.Collections; 26 import java.util.List; 27 28 /** 29 * Logic for matching contacts' data and accumulating match scores. 30 */ 31 public class ContactMatcher { 32 private static final String TAG = "ContactMatcher"; 33 34 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 35 public static final int SCORE_THRESHOLD_SUGGEST = 50; 36 37 // Automatically aggregate contacts if their match score is equal or greater than this threshold 38 public static final int SCORE_THRESHOLD_PRIMARY = 70; 39 40 // Automatically aggregate contacts if the match score is equal or greater than this threshold 41 // and there is a secondary match (phone number, email etc). 42 public static final int SCORE_THRESHOLD_SECONDARY = 50; 43 44 // Score for missing data (as opposed to present data but a bad match) 45 private static final int NO_DATA_SCORE = -1; 46 47 // Score for matching phone numbers 48 private static final int PHONE_MATCH_SCORE = 71; 49 50 // Score for matching email addresses 51 private static final int EMAIL_MATCH_SCORE = 71; 52 53 // Score for matching nickname 54 private static final int NICKNAME_MATCH_SCORE = 71; 55 56 // Maximum number of characters in a name to be considered by the matching algorithm. 57 private static final int MAX_MATCHED_NAME_LENGTH = 30; 58 59 public static final int MATCHING_ALGORITHM_EXACT = 0; 60 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 61 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 62 63 // Minimum edit distance between two names to be considered an approximate match 64 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 65 66 // Minimum edit distance between two email ids to be considered an approximate match 67 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 68 69 // Returned value when we found multiple matches and that was not allowed 70 public static final long MULTIPLE_MATCHES = -2; 71 72 /** 73 * Name matching scores: a matrix by name type vs. candidate lookup type. 74 * For example, if the name type is "full name" while we are looking for a 75 * "full name", the score may be 99. If we are looking for a "nickname" but 76 * find "first name", the score may be 50 (see specific scores defined 77 * below.) 78 * <p> 79 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 80 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 81 * match producing the score of 70. The score may also be 0 if the similarity (distance) 82 * between the strings is below the threshold. 83 * <p> 84 * We use a string matching algorithm, which is particularly suited for 85 * name matching. See {@link NameDistance}. 86 */ 87 private static int[] sMinScore = 88 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 89 private static int[] sMaxScore = 90 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 91 92 /* 93 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 94 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 95 * not! They are useful in three-way aggregation cases when we have, for example, both 96 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 97 * with the former rather than the latter. This is why "reverse" matches have slightly lower 98 * scores than direct matches. 99 */ 100 static { setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)101 setScoreRange(NameLookupType.NAME_EXACT, 102 NameLookupType.NAME_EXACT, 99, 99); setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)103 setScoreRange(NameLookupType.NAME_VARIANT, 104 NameLookupType.NAME_VARIANT, 90, 90); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)105 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 106 NameLookupType.NAME_COLLATION_KEY, 50, 80); 107 setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)108 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 109 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)110 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 111 NameLookupType.NICKNAME, 50, 60); 112 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)113 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 114 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)115 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 116 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)117 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 118 NameLookupType.NICKNAME, 50, 60); 119 setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)120 setScoreRange(NameLookupType.NICKNAME, 121 NameLookupType.NICKNAME, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)122 setScoreRange(NameLookupType.NICKNAME, 123 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)124 setScoreRange(NameLookupType.NICKNAME, 125 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 126 } 127 128 /** 129 * Populates the cells of the score matrix and score span matrix 130 * corresponding to the {@code candidateNameType} and {@code nameType}. 131 */ setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)132 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { 133 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 134 sMinScore[index] = scoreFrom; 135 sMaxScore[index] = scoreTo; 136 } 137 138 /** 139 * Returns the lower range for the match score for the given {@code candidateNameType} and 140 * {@code nameType}. 141 */ getMinScore(int candidateNameType, int nameType)142 private static int getMinScore(int candidateNameType, int nameType) { 143 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 144 return sMinScore[index]; 145 } 146 147 /** 148 * Returns the upper range for the match score for the given {@code candidateNameType} and 149 * {@code nameType}. 150 */ getMaxScore(int candidateNameType, int nameType)151 private static int getMaxScore(int candidateNameType, int nameType) { 152 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 153 return sMaxScore[index]; 154 } 155 156 private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>(); 157 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 158 private int mScoreCount = 0; 159 160 private final NameDistance mNameDistanceConservative = new NameDistance(); 161 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 162 getMatchingScore(long contactId)163 private MatchScore getMatchingScore(long contactId) { 164 MatchScore matchingScore = mScores.get(contactId); 165 if (matchingScore == null) { 166 if (mScoreList.size() > mScoreCount) { 167 matchingScore = mScoreList.get(mScoreCount); 168 matchingScore.reset(contactId); 169 } else { 170 matchingScore = new MatchScore(contactId); 171 mScoreList.add(matchingScore); 172 } 173 mScoreCount++; 174 mScores.put(contactId, matchingScore); 175 } 176 return matchingScore; 177 } 178 179 /** 180 * Marks the contact as a full match, because we found an Identity match 181 */ matchIdentity(long contactId)182 public void matchIdentity(long contactId) { 183 updatePrimaryScore(contactId, MatchScore.MAX_SCORE); 184 } 185 186 /** 187 * Checks if there is a match and updates the overall score for the 188 * specified contact for a discovered match. The new score is determined 189 * by the prior score, by the type of name we were looking for, the type 190 * of name we found and, if the match is approximate, the distance between the candidate and 191 * actual name. 192 */ matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)193 public void matchName(long contactId, int candidateNameType, String candidateName, 194 int nameType, String name, int algorithm) { 195 int maxScore = getMaxScore(candidateNameType, nameType); 196 if (maxScore == 0) { 197 return; 198 } 199 200 if (candidateName.equals(name)) { 201 updatePrimaryScore(contactId, maxScore); 202 return; 203 } 204 205 if (algorithm == MATCHING_ALGORITHM_EXACT) { 206 return; 207 } 208 209 int minScore = getMinScore(candidateNameType, nameType); 210 if (minScore == maxScore) { 211 return; 212 } 213 214 final byte[] decodedCandidateName; 215 final byte[] decodedName; 216 try { 217 decodedCandidateName = Hex.decodeHex(candidateName); 218 decodedName = Hex.decodeHex(name); 219 } catch (RuntimeException e) { 220 // How could this happen?? See bug 6827136 221 Log.e(TAG, "Failed to decode normalized name. Skipping.", e); 222 return; 223 } 224 225 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 226 mNameDistanceConservative : mNameDistanceApproximate; 227 228 int score; 229 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 230 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 231 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 232 float threshold = emailBased 233 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 234 : APPROXIMATE_MATCH_THRESHOLD; 235 if (distance > threshold) { 236 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 237 } else { 238 score = 0; 239 } 240 241 updatePrimaryScore(contactId, score); 242 } 243 updateScoreWithPhoneNumberMatch(long contactId)244 public void updateScoreWithPhoneNumberMatch(long contactId) { 245 updateSecondaryScore(contactId, PHONE_MATCH_SCORE); 246 } 247 updateScoreWithEmailMatch(long contactId)248 public void updateScoreWithEmailMatch(long contactId) { 249 updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); 250 } 251 updateScoreWithNicknameMatch(long contactId)252 public void updateScoreWithNicknameMatch(long contactId) { 253 updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); 254 } 255 updatePrimaryScore(long contactId, int score)256 private void updatePrimaryScore(long contactId, int score) { 257 getMatchingScore(contactId).updatePrimaryScore(score); 258 } 259 updateSecondaryScore(long contactId, int score)260 private void updateSecondaryScore(long contactId, int score) { 261 getMatchingScore(contactId).updateSecondaryScore(score); 262 } 263 keepIn(long contactId)264 public void keepIn(long contactId) { 265 getMatchingScore(contactId).keepIn(); 266 } 267 keepOut(long contactId)268 public void keepOut(long contactId) { 269 getMatchingScore(contactId).keepOut(); 270 } 271 clear()272 public void clear() { 273 mScores.clear(); 274 mScoreCount = 0; 275 } 276 277 /** 278 * Returns a list of IDs for contacts that are matched on secondary data elements 279 * (phone number, email address, nickname). We still need to obtain the approximate 280 * primary score for those contacts to determine if any of them should be aggregated. 281 * <p> 282 * May return null. 283 */ prepareSecondaryMatchCandidates(int threshold)284 public List<Long> prepareSecondaryMatchCandidates(int threshold) { 285 ArrayList<Long> contactIds = null; 286 287 for (int i = 0; i < mScoreCount; i++) { 288 MatchScore score = mScoreList.get(i); 289 if (score.isKeepOut()) { 290 continue; 291 } 292 293 int s = score.getSecondaryScore(); 294 if (s >= threshold) { 295 if (contactIds == null) { 296 contactIds = new ArrayList<Long>(); 297 } 298 contactIds.add(score.getContactId()); 299 } 300 score.setPrimaryScore(NO_DATA_SCORE); 301 } 302 return contactIds; 303 } 304 305 /** 306 * Returns the contactId with the best match score over the specified threshold or -1 307 * if no such contact is found. If multiple contacts are found, and 308 * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if 309 * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}. 310 */ pickBestMatch(int threshold, boolean allowMultipleMatches)311 public long pickBestMatch(int threshold, boolean allowMultipleMatches) { 312 long contactId = -1; 313 int maxScore = 0; 314 for (int i = 0; i < mScoreCount; i++) { 315 MatchScore score = mScoreList.get(i); 316 if (score.isKeepOut()) { 317 continue; 318 } 319 320 if (score.isKeepIn()) { 321 return score.getContactId(); 322 } 323 324 int s = score.getPrimaryScore(); 325 if (s == NO_DATA_SCORE) { 326 s = score.getSecondaryScore(); 327 } 328 329 if (s >= threshold) { 330 if (contactId != -1 && !allowMultipleMatches) { 331 return MULTIPLE_MATCHES; 332 } 333 // In order to make it stable, let's jut pick the one with the lowest ID 334 // if multiple candidates are found. 335 if ((s > maxScore) || ((s == maxScore) && (contactId > score.getContactId()))) { 336 contactId = score.getContactId(); 337 maxScore = s; 338 } 339 } 340 } 341 return contactId; 342 } 343 344 /** 345 * Returns matches in the order of descending score. 346 */ pickBestMatches(int threshold)347 public List<MatchScore> pickBestMatches(int threshold) { 348 int scaledThreshold = threshold * MatchScore.SCORE_SCALE; 349 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 350 Collections.sort(matches); 351 int count = 0; 352 for (int i = 0; i < mScoreCount; i++) { 353 MatchScore matchScore = matches.get(i); 354 if (matchScore.getScore() >= scaledThreshold) { 355 count++; 356 } else { 357 break; 358 } 359 } 360 361 return matches.subList(0, count); 362 } 363 364 @Override toString()365 public String toString() { 366 return mScoreList.subList(0, mScoreCount).toString(); 367 } 368 } 369