1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts; 17 18 import com.android.internal.util.HanziToPinyin; 19 import com.android.internal.util.HanziToPinyin.Token; 20 21 import android.content.ContentValues; 22 import android.provider.ContactsContract.FullNameStyle; 23 import android.provider.ContactsContract.PhoneticNameStyle; 24 import android.provider.ContactsContract.CommonDataKinds.StructuredName; 25 import android.text.TextUtils; 26 27 import java.lang.Character.UnicodeBlock; 28 import java.util.ArrayList; 29 import java.util.HashSet; 30 import java.util.Locale; 31 import java.util.StringTokenizer; 32 33 /** 34 * The purpose of this class is to split a full name into given names and last 35 * name. The logic only supports having a single last name. If the full name has 36 * multiple last names the output will be incorrect. 37 * <p> 38 * Core algorithm: 39 * <ol> 40 * <li>Remove the suffixes (III, Ph.D., M.D.).</li> 41 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li> 42 * <li>Assign the last remaining token as the last name.</li> 43 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use 44 * this word also as the last name.</li> 45 * <li>Assign the rest of the words as the "given names".</li> 46 * </ol> 47 */ 48 public class NameSplitter { 49 50 public static final int MAX_TOKENS = 10; 51 52 private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase(); 53 private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase(); 54 55 // This includes simplified and traditional Chinese 56 private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase(); 57 58 private final HashSet<String> mPrefixesSet; 59 private final HashSet<String> mSuffixesSet; 60 private final int mMaxSuffixLength; 61 private final HashSet<String> mLastNamePrefixesSet; 62 private final HashSet<String> mConjuctions; 63 private final Locale mLocale; 64 private final String mLanguage; 65 66 public static class Name { 67 public String prefix; 68 public String givenNames; 69 public String middleName; 70 public String familyName; 71 public String suffix; 72 73 public int fullNameStyle; 74 75 public String phoneticFamilyName; 76 public String phoneticMiddleName; 77 public String phoneticGivenName; 78 79 public int phoneticNameStyle; 80 Name()81 public Name() { 82 } 83 Name(String prefix, String givenNames, String middleName, String familyName, String suffix)84 public Name(String prefix, String givenNames, String middleName, String familyName, 85 String suffix) { 86 this.prefix = prefix; 87 this.givenNames = givenNames; 88 this.middleName = middleName; 89 this.familyName = familyName; 90 this.suffix = suffix; 91 } 92 getPrefix()93 public String getPrefix() { 94 return prefix; 95 } 96 getGivenNames()97 public String getGivenNames() { 98 return givenNames; 99 } 100 getMiddleName()101 public String getMiddleName() { 102 return middleName; 103 } 104 getFamilyName()105 public String getFamilyName() { 106 return familyName; 107 } 108 getSuffix()109 public String getSuffix() { 110 return suffix; 111 } 112 getFullNameStyle()113 public int getFullNameStyle() { 114 return fullNameStyle; 115 } 116 getPhoneticFamilyName()117 public String getPhoneticFamilyName() { 118 return phoneticFamilyName; 119 } 120 getPhoneticMiddleName()121 public String getPhoneticMiddleName() { 122 return phoneticMiddleName; 123 } 124 getPhoneticGivenName()125 public String getPhoneticGivenName() { 126 return phoneticGivenName; 127 } 128 getPhoneticNameStyle()129 public int getPhoneticNameStyle() { 130 return phoneticNameStyle; 131 } 132 fromValues(ContentValues values)133 public void fromValues(ContentValues values) { 134 prefix = values.getAsString(StructuredName.PREFIX); 135 givenNames = values.getAsString(StructuredName.GIVEN_NAME); 136 middleName = values.getAsString(StructuredName.MIDDLE_NAME); 137 familyName = values.getAsString(StructuredName.FAMILY_NAME); 138 suffix = values.getAsString(StructuredName.SUFFIX); 139 140 Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE); 141 fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer; 142 143 phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME); 144 phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME); 145 phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME); 146 147 integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE); 148 phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer; 149 } 150 toValues(ContentValues values)151 public void toValues(ContentValues values) { 152 putValueIfPresent(values, StructuredName.PREFIX, prefix); 153 putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames); 154 putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName); 155 putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName); 156 putValueIfPresent(values, StructuredName.SUFFIX, suffix); 157 values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle); 158 putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName); 159 putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName); 160 putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName); 161 values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle); 162 } 163 putValueIfPresent(ContentValues values, String name, String value)164 private void putValueIfPresent(ContentValues values, String name, String value) { 165 if (value != null) { 166 values.put(name, value); 167 } 168 } 169 clear()170 public void clear() { 171 prefix = null; 172 givenNames = null; 173 middleName = null; 174 familyName = null; 175 suffix = null; 176 fullNameStyle = FullNameStyle.UNDEFINED; 177 phoneticFamilyName = null; 178 phoneticMiddleName = null; 179 phoneticGivenName = null; 180 phoneticNameStyle = PhoneticNameStyle.UNDEFINED; 181 } 182 isEmpty()183 public boolean isEmpty() { 184 return TextUtils.isEmpty(givenNames) 185 && TextUtils.isEmpty(middleName) 186 && TextUtils.isEmpty(familyName) 187 && TextUtils.isEmpty(suffix) 188 && TextUtils.isEmpty(phoneticFamilyName) 189 && TextUtils.isEmpty(phoneticMiddleName) 190 && TextUtils.isEmpty(phoneticGivenName); 191 } 192 193 @Override toString()194 public String toString() { 195 return "[given: " + givenNames + " middle: " + middleName + " family: " + familyName 196 + " ph/given: " + phoneticGivenName + " ph/middle: " + phoneticMiddleName 197 + " ph/family: " + phoneticFamilyName + "]"; 198 } 199 200 } 201 202 private static class NameTokenizer extends StringTokenizer { 203 private final String[] mTokens; 204 private int mDotBitmask; 205 private int mCommaBitmask; 206 private int mStartPointer; 207 private int mEndPointer; 208 NameTokenizer(String fullName)209 public NameTokenizer(String fullName) { 210 super(fullName, " .,", true); 211 212 mTokens = new String[MAX_TOKENS]; 213 214 // Iterate over tokens, skipping over empty ones and marking tokens that 215 // are followed by dots. 216 while (hasMoreTokens() && mEndPointer < MAX_TOKENS) { 217 final String token = nextToken(); 218 if (token.length() > 0) { 219 final char c = token.charAt(0); 220 if (c == ' ') { 221 continue; 222 } 223 } 224 225 if (mEndPointer > 0 && token.charAt(0) == '.') { 226 mDotBitmask |= (1 << (mEndPointer - 1)); 227 } else if (mEndPointer > 0 && token.charAt(0) == ',') { 228 mCommaBitmask |= (1 << (mEndPointer - 1)); 229 } else { 230 mTokens[mEndPointer] = token; 231 mEndPointer++; 232 } 233 } 234 } 235 236 /** 237 * Returns true if the token is followed by a dot in the original full name. 238 */ hasDot(int index)239 public boolean hasDot(int index) { 240 return (mDotBitmask & (1 << index)) != 0; 241 } 242 243 /** 244 * Returns true if the token is followed by a comma in the original full name. 245 */ hasComma(int index)246 public boolean hasComma(int index) { 247 return (mCommaBitmask & (1 << index)) != 0; 248 } 249 } 250 251 /** 252 * Constructor. 253 * 254 * @param commonPrefixes comma-separated list of common prefixes, 255 * e.g. "Mr, Ms, Mrs" 256 * @param commonLastNamePrefixes comma-separated list of common last name prefixes, 257 * e.g. "d', st, st., von" 258 * @param commonSuffixes comma-separated list of common suffixes, 259 * e.g. "Jr, M.D., MD, D.D.S." 260 * @param commonConjunctions comma-separated list of common conjuctions, 261 * e.g. "AND, Or" 262 */ NameSplitter(String commonPrefixes, String commonLastNamePrefixes, String commonSuffixes, String commonConjunctions, Locale locale)263 public NameSplitter(String commonPrefixes, String commonLastNamePrefixes, 264 String commonSuffixes, String commonConjunctions, Locale locale) { 265 // TODO: refactor this to use <string-array> resources 266 mPrefixesSet = convertToSet(commonPrefixes); 267 mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes); 268 mSuffixesSet = convertToSet(commonSuffixes); 269 mConjuctions = convertToSet(commonConjunctions); 270 mLocale = locale != null ? locale : Locale.getDefault(); 271 mLanguage = mLocale.getLanguage().toLowerCase(); 272 273 int maxLength = 0; 274 for (String suffix : mSuffixesSet) { 275 if (suffix.length() > maxLength) { 276 maxLength = suffix.length(); 277 } 278 } 279 280 mMaxSuffixLength = maxLength; 281 } 282 283 /** 284 * Converts a comma-separated list of Strings to a set of Strings. Trims strings 285 * and converts them to upper case. 286 */ convertToSet(String strings)287 private static HashSet<String> convertToSet(String strings) { 288 HashSet<String> set = new HashSet<String>(); 289 if (strings != null) { 290 String[] split = strings.split(","); 291 for (int i = 0; i < split.length; i++) { 292 set.add(split[i].trim().toUpperCase()); 293 } 294 } 295 return set; 296 } 297 298 /** 299 * Parses a full name and returns components as a list of tokens. 300 */ tokenize(String[] tokens, String fullName)301 public int tokenize(String[] tokens, String fullName) { 302 if (fullName == null) { 303 return 0; 304 } 305 306 NameTokenizer tokenizer = new NameTokenizer(fullName); 307 308 if (tokenizer.mStartPointer == tokenizer.mEndPointer) { 309 return 0; 310 } 311 312 String firstToken = tokenizer.mTokens[tokenizer.mStartPointer]; 313 if (mPrefixesSet.contains(firstToken.toUpperCase())) { 314 tokenizer.mStartPointer++; 315 } 316 int count = 0; 317 for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) { 318 tokens[count++] = tokenizer.mTokens[i]; 319 } 320 321 return count; 322 } 323 324 325 /** 326 * Parses a full name and returns parsed components in the Name object. 327 */ split(Name name, String fullName)328 public void split(Name name, String fullName) { 329 if (fullName == null) { 330 return; 331 } 332 333 int fullNameStyle = guessFullNameStyle(fullName); 334 if (fullNameStyle == FullNameStyle.CJK) { 335 fullNameStyle = getAdjustedFullNameStyle(fullNameStyle); 336 } 337 338 name.fullNameStyle = fullNameStyle; 339 340 switch (fullNameStyle) { 341 case FullNameStyle.CHINESE: 342 splitChineseName(name, fullName); 343 break; 344 345 case FullNameStyle.JAPANESE: 346 case FullNameStyle.KOREAN: 347 splitJapaneseOrKoreanName(name, fullName); 348 break; 349 350 default: 351 splitWesternName(name, fullName); 352 } 353 } 354 355 /** 356 * Splits a full name composed according to the Western tradition: 357 * <pre> 358 * [prefix] given name(s) [[middle name] family name] [, suffix] 359 * [prefix] family name, given name [middle name] [,suffix] 360 * </pre> 361 */ splitWesternName(Name name, String fullName)362 private void splitWesternName(Name name, String fullName) { 363 NameTokenizer tokens = new NameTokenizer(fullName); 364 parsePrefix(name, tokens); 365 366 // If the name consists of just one or two tokens, treat them as first/last name, 367 // not as suffix. Example: John Ma; Ma is last name, not "M.A.". 368 if (tokens.mEndPointer > 2) { 369 parseSuffix(name, tokens); 370 } 371 372 if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) { 373 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 374 } else { 375 parseLastName(name, tokens); 376 parseMiddleName(name, tokens); 377 parseGivenNames(name, tokens); 378 } 379 } 380 381 /** 382 * Splits a full name composed according to the Chinese tradition: 383 * <pre> 384 * [family name [middle name]] given name 385 * </pre> 386 */ splitChineseName(Name name, String fullName)387 private void splitChineseName(Name name, String fullName) { 388 StringTokenizer tokenizer = new StringTokenizer(fullName); 389 while (tokenizer.hasMoreTokens()) { 390 String token = tokenizer.nextToken(); 391 if (name.givenNames == null) { 392 name.givenNames = token; 393 } else if (name.familyName == null) { 394 name.familyName = name.givenNames; 395 name.givenNames = token; 396 } else if (name.middleName == null) { 397 name.middleName = name.givenNames; 398 name.givenNames = token; 399 } else { 400 name.middleName = name.middleName + name.givenNames; 401 name.givenNames = token; 402 } 403 } 404 405 // If a single word parse that word up. 406 if (name.givenNames != null && name.familyName == null && name.middleName == null) { 407 int length = fullName.length(); 408 if (length == 2) { 409 name.familyName = fullName.substring(0, 1); 410 name.givenNames = fullName.substring(1); 411 } else if (length == 3) { 412 name.familyName = fullName.substring(0, 1); 413 name.middleName = fullName.substring(1, 2); 414 name.givenNames = fullName.substring(2); 415 } else if (length == 4) { 416 name.familyName = fullName.substring(0, 2); 417 name.middleName = fullName.substring(2, 3); 418 name.givenNames = fullName.substring(3); 419 } 420 421 } 422 } 423 424 /** 425 * Splits a full name composed according to the Japanese tradition: 426 * <pre> 427 * [family name] given name(s) 428 * </pre> 429 */ splitJapaneseOrKoreanName(Name name, String fullName)430 private void splitJapaneseOrKoreanName(Name name, String fullName) { 431 StringTokenizer tokenizer = new StringTokenizer(fullName); 432 while (tokenizer.hasMoreTokens()) { 433 String token = tokenizer.nextToken(); 434 if (name.givenNames == null) { 435 name.givenNames = token; 436 } else if (name.familyName == null) { 437 name.familyName = name.givenNames; 438 name.givenNames = token; 439 } else { 440 name.givenNames += " " + token; 441 } 442 } 443 } 444 445 /** 446 * Concatenates components of a name according to the rules dictated by the name style. 447 * 448 * @param givenNameFirst is ignored for CJK display name styles 449 */ join(Name name, boolean givenNameFirst)450 public String join(Name name, boolean givenNameFirst) { 451 switch (name.fullNameStyle) { 452 case FullNameStyle.CJK: 453 case FullNameStyle.CHINESE: 454 case FullNameStyle.KOREAN: 455 return join(name.familyName, name.middleName, name.givenNames, name.suffix, 456 false, false, false); 457 458 case FullNameStyle.JAPANESE: 459 return join(name.familyName, name.middleName, name.givenNames, name.suffix, 460 true, false, false); 461 462 default: 463 if (givenNameFirst) { 464 return join(name.givenNames, name.middleName, name.familyName, name.suffix, 465 true, false, true); 466 } else { 467 return join(name.familyName, name.givenNames, name.middleName, name.suffix, 468 true, true, true); 469 } 470 } 471 } 472 473 /** 474 * Concatenates components of the phonetic name following the CJK tradition: 475 * family name + middle name + given name(s). 476 */ joinPhoneticName(Name name)477 public String joinPhoneticName(Name name) { 478 return join(name.phoneticFamilyName, name.phoneticMiddleName, 479 name.phoneticGivenName, null, true, false, false); 480 } 481 482 /** 483 * Concatenates parts of a full name inserting spaces and commas as specified. 484 */ join(String part1, String part2, String part3, String suffix, boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3)485 private String join(String part1, String part2, String part3, String suffix, 486 boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) { 487 boolean hasPart1 = !TextUtils.isEmpty(part1); 488 boolean hasPart2 = !TextUtils.isEmpty(part2); 489 boolean hasPart3 = !TextUtils.isEmpty(part3); 490 boolean hasSuffix = !TextUtils.isEmpty(suffix); 491 492 boolean isSingleWord = true; 493 String singleWord = null; 494 if (hasPart1) { 495 singleWord = part1; 496 } 497 498 if (hasPart2) { 499 if (singleWord != null) { 500 isSingleWord = false; 501 } else { 502 singleWord = part2; 503 } 504 } 505 506 if (hasPart3) { 507 if (singleWord != null) { 508 isSingleWord = false; 509 } else { 510 singleWord = part3; 511 } 512 } 513 514 if (hasSuffix) { 515 if (singleWord != null) { 516 isSingleWord = false; 517 } else { 518 singleWord = normalizedSuffix(suffix); 519 } 520 } 521 522 if (isSingleWord) { 523 return singleWord; 524 } 525 526 StringBuilder sb = new StringBuilder(); 527 if (hasPart1) { 528 sb.append(part1); 529 } 530 531 if (hasPart2) { 532 if (hasPart1) { 533 if (useCommaAfterPart1) { 534 sb.append(','); 535 } 536 if (useSpace) { 537 sb.append(' '); 538 } 539 } 540 sb.append(part2); 541 } 542 543 if (hasPart3) { 544 if (hasPart1 || hasPart2) { 545 if (useSpace) { 546 sb.append(' '); 547 } 548 } 549 sb.append(part3); 550 } 551 552 if (hasSuffix) { 553 if (hasPart1 || hasPart2 || hasPart3) { 554 if (useCommaAfterPart3) { 555 sb.append(','); 556 } 557 if (useSpace) { 558 sb.append(' '); 559 } 560 } 561 sb.append(normalizedSuffix(suffix)); 562 } 563 564 return sb.toString(); 565 } 566 567 /** 568 * Puts a dot after the supplied suffix if that is the accepted form of the suffix, 569 * e.g. "Jr." and "Sr.", but not "I", "II" and "III". 570 */ normalizedSuffix(String suffix)571 private String normalizedSuffix(String suffix) { 572 int length = suffix.length(); 573 if (length == 0 || suffix.charAt(length - 1) == '.') { 574 return suffix; 575 } 576 577 String withDot = suffix + '.'; 578 if (mSuffixesSet.contains(withDot.toUpperCase())) { 579 return withDot; 580 } else { 581 return suffix; 582 } 583 } 584 585 /** 586 * If the supplied name style is undefined, returns a default based on the language, 587 * otherwise returns the supplied name style itself. 588 * 589 * @param nameStyle See {@link FullNameStyle}. 590 */ getAdjustedFullNameStyle(int nameStyle)591 public int getAdjustedFullNameStyle(int nameStyle) { 592 if (nameStyle == FullNameStyle.UNDEFINED) { 593 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 594 return FullNameStyle.JAPANESE; 595 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 596 return FullNameStyle.KOREAN; 597 } else if (CHINESE_LANGUAGE.equals(mLanguage)) { 598 return FullNameStyle.CHINESE; 599 } else { 600 return FullNameStyle.WESTERN; 601 } 602 } else if (nameStyle == FullNameStyle.CJK) { 603 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 604 return FullNameStyle.JAPANESE; 605 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 606 return FullNameStyle.KOREAN; 607 } else { 608 return FullNameStyle.CHINESE; 609 } 610 } 611 return nameStyle; 612 } 613 614 /** 615 * Parses the first word from the name if it is a prefix. 616 */ parsePrefix(Name name, NameTokenizer tokens)617 private void parsePrefix(Name name, NameTokenizer tokens) { 618 if (tokens.mStartPointer == tokens.mEndPointer) { 619 return; 620 } 621 622 String firstToken = tokens.mTokens[tokens.mStartPointer]; 623 if (mPrefixesSet.contains(firstToken.toUpperCase())) { 624 name.prefix = firstToken; 625 tokens.mStartPointer++; 626 } 627 } 628 629 /** 630 * Parses the last word(s) from the name if it is a suffix. 631 */ parseSuffix(Name name, NameTokenizer tokens)632 private void parseSuffix(Name name, NameTokenizer tokens) { 633 if (tokens.mStartPointer == tokens.mEndPointer) { 634 return; 635 } 636 637 String lastToken = tokens.mTokens[tokens.mEndPointer - 1]; 638 if (lastToken.length() > mMaxSuffixLength) { 639 return; 640 } 641 642 String normalized = lastToken.toUpperCase(); 643 if (mSuffixesSet.contains(normalized)) { 644 name.suffix = lastToken; 645 tokens.mEndPointer--; 646 return; 647 } 648 649 if (tokens.hasDot(tokens.mEndPointer - 1)) { 650 lastToken += '.'; 651 } 652 normalized += "."; 653 654 // Take care of suffixes like M.D. and D.D.S. 655 int pos = tokens.mEndPointer - 1; 656 while (normalized.length() <= mMaxSuffixLength) { 657 658 if (mSuffixesSet.contains(normalized)) { 659 name.suffix = lastToken; 660 tokens.mEndPointer = pos; 661 return; 662 } 663 664 if (pos == tokens.mStartPointer) { 665 break; 666 } 667 668 pos--; 669 if (tokens.hasDot(pos)) { 670 lastToken = tokens.mTokens[pos] + "." + lastToken; 671 } else { 672 lastToken = tokens.mTokens[pos] + " " + lastToken; 673 } 674 675 normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized; 676 } 677 } 678 parseLastName(Name name, NameTokenizer tokens)679 private void parseLastName(Name name, NameTokenizer tokens) { 680 if (tokens.mStartPointer == tokens.mEndPointer) { 681 return; 682 } 683 684 // If the first word is followed by a comma, assume that it's the family name 685 if (tokens.hasComma(tokens.mStartPointer)) { 686 name.familyName = tokens.mTokens[tokens.mStartPointer]; 687 tokens.mStartPointer++; 688 return; 689 } 690 691 // If the second word is followed by a comma and the first word 692 // is a last name prefix as in "de Sade" and "von Cliburn", treat 693 // the first two words as the family name. 694 if (tokens.mStartPointer + 1 < tokens.mEndPointer 695 && tokens.hasComma(tokens.mStartPointer + 1) 696 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) { 697 String familyNamePrefix = tokens.mTokens[tokens.mStartPointer]; 698 if (tokens.hasDot(tokens.mStartPointer)) { 699 familyNamePrefix += '.'; 700 } 701 name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1]; 702 tokens.mStartPointer += 2; 703 return; 704 } 705 706 // Finally, assume that the last word is the last name 707 name.familyName = tokens.mTokens[tokens.mEndPointer - 1]; 708 tokens.mEndPointer--; 709 710 // Take care of last names like "de Sade" and "von Cliburn" 711 if ((tokens.mEndPointer - tokens.mStartPointer) > 0) { 712 String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1]; 713 if (isFamilyNamePrefix(lastNamePrefix)) { 714 if (tokens.hasDot(tokens.mEndPointer - 1)) { 715 lastNamePrefix += '.'; 716 } 717 name.familyName = lastNamePrefix + " " + name.familyName; 718 tokens.mEndPointer--; 719 } 720 } 721 } 722 723 /** 724 * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de" 725 */ isFamilyNamePrefix(String word)726 private boolean isFamilyNamePrefix(String word) { 727 final String normalized = word.toUpperCase(); 728 729 return mLastNamePrefixesSet.contains(normalized) 730 || mLastNamePrefixesSet.contains(normalized + "."); 731 } 732 733 parseMiddleName(Name name, NameTokenizer tokens)734 private void parseMiddleName(Name name, NameTokenizer tokens) { 735 if (tokens.mStartPointer == tokens.mEndPointer) { 736 return; 737 } 738 739 if ((tokens.mEndPointer - tokens.mStartPointer) > 1) { 740 if ((tokens.mEndPointer - tokens.mStartPointer) == 2 741 || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2]. 742 toUpperCase())) { 743 name.middleName = tokens.mTokens[tokens.mEndPointer - 1]; 744 if (tokens.hasDot(tokens.mEndPointer - 1)) { 745 name.middleName += '.'; 746 } 747 tokens.mEndPointer--; 748 } 749 } 750 } 751 parseGivenNames(Name name, NameTokenizer tokens)752 private void parseGivenNames(Name name, NameTokenizer tokens) { 753 if (tokens.mStartPointer == tokens.mEndPointer) { 754 return; 755 } 756 757 if ((tokens.mEndPointer - tokens.mStartPointer) == 1) { 758 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 759 } else { 760 StringBuilder sb = new StringBuilder(); 761 for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) { 762 if (i != tokens.mStartPointer) { 763 sb.append(' '); 764 } 765 sb.append(tokens.mTokens[i]); 766 if (tokens.hasDot(i)) { 767 sb.append('.'); 768 } 769 } 770 name.givenNames = sb.toString(); 771 } 772 } 773 774 /** 775 * Makes the best guess at the expected full name style based on the character set 776 * used in the supplied name. If the phonetic name is also supplied, tries to 777 * differentiate between Chinese, Japanese and Korean based on the alphabet used 778 * for the phonetic name. 779 */ guessNameStyle(Name name)780 public void guessNameStyle(Name name) { 781 guessFullNameStyle(name); 782 guessPhoneticNameStyle(name); 783 name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle, 784 name.phoneticNameStyle); 785 } 786 787 /** 788 * Updates the display name style according to the phonetic name style if we 789 * were unsure about display name style based on the name components, but 790 * phonetic name makes it more definitive. 791 */ getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle)792 public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) { 793 if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 794 if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) { 795 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) { 796 return FullNameStyle.JAPANESE; 797 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) { 798 return FullNameStyle.KOREAN; 799 } 800 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) { 801 return FullNameStyle.CHINESE; 802 } 803 } 804 } 805 return nameStyle; 806 } 807 808 /** 809 * Makes the best guess at the expected full name style based on the character set 810 * used in the supplied name. 811 */ guessFullNameStyle(NameSplitter.Name name)812 private void guessFullNameStyle(NameSplitter.Name name) { 813 if (name.fullNameStyle != FullNameStyle.UNDEFINED) { 814 return; 815 } 816 817 int bestGuess = guessFullNameStyle(name.givenNames); 818 // A mix of Hanzi and latin chars are common in China, so we have to go through all names 819 // if the name is not JANPANESE or KOREAN. 820 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK 821 && bestGuess != FullNameStyle.WESTERN) { 822 name.fullNameStyle = bestGuess; 823 return; 824 } 825 826 int guess = guessFullNameStyle(name.familyName); 827 if (guess != FullNameStyle.UNDEFINED) { 828 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 829 name.fullNameStyle = guess; 830 return; 831 } 832 bestGuess = guess; 833 } 834 835 guess = guessFullNameStyle(name.middleName); 836 if (guess != FullNameStyle.UNDEFINED) { 837 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 838 name.fullNameStyle = guess; 839 return; 840 } 841 bestGuess = guess; 842 } 843 844 name.fullNameStyle = bestGuess; 845 } 846 guessFullNameStyle(String name)847 public int guessFullNameStyle(String name) { 848 if (name == null) { 849 return FullNameStyle.UNDEFINED; 850 } 851 852 int nameStyle = FullNameStyle.UNDEFINED; 853 int length = name.length(); 854 int offset = 0; 855 while (offset < length) { 856 int codePoint = Character.codePointAt(name, offset); 857 if (Character.isLetter(codePoint)) { 858 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 859 860 if (!isLatinUnicodeBlock(unicodeBlock)) { 861 862 if (isCJKUnicodeBlock(unicodeBlock)) { 863 // We don't know if this is Chinese, Japanese or Korean - 864 // trying to figure out by looking at other characters in the name 865 return guessCJKNameStyle(name, offset + Character.charCount(codePoint)); 866 } 867 868 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 869 return FullNameStyle.JAPANESE; 870 } 871 872 if (isKoreanUnicodeBlock(unicodeBlock)) { 873 return FullNameStyle.KOREAN; 874 } 875 } 876 nameStyle = FullNameStyle.WESTERN; 877 } 878 offset += Character.charCount(codePoint); 879 } 880 return nameStyle; 881 } 882 guessCJKNameStyle(String name, int offset)883 private int guessCJKNameStyle(String name, int offset) { 884 int length = name.length(); 885 while (offset < length) { 886 int codePoint = Character.codePointAt(name, offset); 887 if (Character.isLetter(codePoint)) { 888 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 889 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 890 return FullNameStyle.JAPANESE; 891 } 892 if (isKoreanUnicodeBlock(unicodeBlock)) { 893 return FullNameStyle.KOREAN; 894 } 895 } 896 offset += Character.charCount(codePoint); 897 } 898 899 return FullNameStyle.CJK; 900 } 901 guessPhoneticNameStyle(NameSplitter.Name name)902 private void guessPhoneticNameStyle(NameSplitter.Name name) { 903 if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 904 return; 905 } 906 907 int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName); 908 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) { 909 name.phoneticNameStyle = bestGuess; 910 return; 911 } 912 913 int guess = guessPhoneticNameStyle(name.phoneticGivenName); 914 if (guess != FullNameStyle.UNDEFINED) { 915 if (guess != FullNameStyle.CJK) { 916 name.phoneticNameStyle = guess; 917 return; 918 } 919 bestGuess = guess; 920 } 921 922 guess = guessPhoneticNameStyle(name.phoneticMiddleName); 923 if (guess != FullNameStyle.UNDEFINED) { 924 if (guess != FullNameStyle.CJK) { 925 name.phoneticNameStyle = guess; 926 return; 927 } 928 bestGuess = guess; 929 } 930 } 931 guessPhoneticNameStyle(String name)932 public int guessPhoneticNameStyle(String name) { 933 if (name == null) { 934 return PhoneticNameStyle.UNDEFINED; 935 } 936 937 int nameStyle = PhoneticNameStyle.UNDEFINED; 938 int length = name.length(); 939 int offset = 0; 940 while (offset < length) { 941 int codePoint = Character.codePointAt(name, offset); 942 if (Character.isLetter(codePoint)) { 943 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 944 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 945 return PhoneticNameStyle.JAPANESE; 946 } 947 if (isKoreanUnicodeBlock(unicodeBlock)) { 948 return PhoneticNameStyle.KOREAN; 949 } 950 if (isLatinUnicodeBlock(unicodeBlock)) { 951 return PhoneticNameStyle.PINYIN; 952 } 953 } 954 offset += Character.charCount(codePoint); 955 } 956 957 return nameStyle; 958 } 959 isLatinUnicodeBlock(UnicodeBlock unicodeBlock)960 private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) { 961 return unicodeBlock == UnicodeBlock.BASIC_LATIN || 962 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT || 963 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A || 964 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B || 965 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL; 966 } 967 isCJKUnicodeBlock(UnicodeBlock block)968 private static boolean isCJKUnicodeBlock(UnicodeBlock block) { 969 return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 970 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 971 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 972 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION 973 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT 974 || block == UnicodeBlock.CJK_COMPATIBILITY 975 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS 976 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 977 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT; 978 } 979 isKoreanUnicodeBlock(UnicodeBlock unicodeBlock)980 private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) { 981 return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES || 982 unicodeBlock == UnicodeBlock.HANGUL_JAMO || 983 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO; 984 } 985 isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock)986 private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) { 987 return unicodeBlock == UnicodeBlock.KATAKANA || 988 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS || 989 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || 990 unicodeBlock == UnicodeBlock.HIRAGANA; 991 } 992 } 993