• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts;
17 
18 import com.android.internal.util.HanziToPinyin;
19 import com.android.internal.util.HanziToPinyin.Token;
20 
21 import android.content.ContentValues;
22 import android.provider.ContactsContract.FullNameStyle;
23 import android.provider.ContactsContract.PhoneticNameStyle;
24 import android.provider.ContactsContract.CommonDataKinds.StructuredName;
25 import android.text.TextUtils;
26 
27 import java.lang.Character.UnicodeBlock;
28 import java.util.ArrayList;
29 import java.util.HashSet;
30 import java.util.Locale;
31 import java.util.StringTokenizer;
32 
33 /**
34  * The purpose of this class is to split a full name into given names and last
35  * name. The logic only supports having a single last name. If the full name has
36  * multiple last names the output will be incorrect.
37  * <p>
38  * Core algorithm:
39  * <ol>
40  * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
41  * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
42  * <li>Assign the last remaining token as the last name.</li>
43  * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
44  * this word also as the last name.</li>
45  * <li>Assign the rest of the words as the "given names".</li>
46  * </ol>
47  */
48 public class NameSplitter {
49 
50     public static final int MAX_TOKENS = 10;
51 
52     private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
53     private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();
54 
55     // This includes simplified and traditional Chinese
56     private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();
57 
58     private final HashSet<String> mPrefixesSet;
59     private final HashSet<String> mSuffixesSet;
60     private final int mMaxSuffixLength;
61     private final HashSet<String> mLastNamePrefixesSet;
62     private final HashSet<String> mConjuctions;
63     private final Locale mLocale;
64     private final String mLanguage;
65 
66     public static class Name {
67         public String prefix;
68         public String givenNames;
69         public String middleName;
70         public String familyName;
71         public String suffix;
72 
73         public int fullNameStyle;
74 
75         public String phoneticFamilyName;
76         public String phoneticMiddleName;
77         public String phoneticGivenName;
78 
79         public int phoneticNameStyle;
80 
Name()81         public Name() {
82         }
83 
Name(String prefix, String givenNames, String middleName, String familyName, String suffix)84         public Name(String prefix, String givenNames, String middleName, String familyName,
85                 String suffix) {
86             this.prefix = prefix;
87             this.givenNames = givenNames;
88             this.middleName = middleName;
89             this.familyName = familyName;
90             this.suffix = suffix;
91         }
92 
getPrefix()93         public String getPrefix() {
94             return prefix;
95         }
96 
getGivenNames()97         public String getGivenNames() {
98             return givenNames;
99         }
100 
getMiddleName()101         public String getMiddleName() {
102             return middleName;
103         }
104 
getFamilyName()105         public String getFamilyName() {
106             return familyName;
107         }
108 
getSuffix()109         public String getSuffix() {
110             return suffix;
111         }
112 
getFullNameStyle()113         public int getFullNameStyle() {
114             return fullNameStyle;
115         }
116 
getPhoneticFamilyName()117         public String getPhoneticFamilyName() {
118             return phoneticFamilyName;
119         }
120 
getPhoneticMiddleName()121         public String getPhoneticMiddleName() {
122             return phoneticMiddleName;
123         }
124 
getPhoneticGivenName()125         public String getPhoneticGivenName() {
126             return phoneticGivenName;
127         }
128 
getPhoneticNameStyle()129         public int getPhoneticNameStyle() {
130             return phoneticNameStyle;
131         }
132 
fromValues(ContentValues values)133         public void fromValues(ContentValues values) {
134             prefix = values.getAsString(StructuredName.PREFIX);
135             givenNames = values.getAsString(StructuredName.GIVEN_NAME);
136             middleName = values.getAsString(StructuredName.MIDDLE_NAME);
137             familyName = values.getAsString(StructuredName.FAMILY_NAME);
138             suffix = values.getAsString(StructuredName.SUFFIX);
139 
140             Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
141             fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;
142 
143             phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
144             phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
145             phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);
146 
147             integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
148             phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
149         }
150 
toValues(ContentValues values)151         public void toValues(ContentValues values) {
152             putValueIfPresent(values, StructuredName.PREFIX, prefix);
153             putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
154             putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
155             putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
156             putValueIfPresent(values, StructuredName.SUFFIX, suffix);
157             values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
158             putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
159             putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
160             putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
161             values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
162         }
163 
putValueIfPresent(ContentValues values, String name, String value)164         private void putValueIfPresent(ContentValues values, String name, String value) {
165             if (value != null) {
166                 values.put(name, value);
167             }
168         }
169 
clear()170         public void clear() {
171             prefix = null;
172             givenNames = null;
173             middleName = null;
174             familyName = null;
175             suffix = null;
176             fullNameStyle = FullNameStyle.UNDEFINED;
177             phoneticFamilyName = null;
178             phoneticMiddleName = null;
179             phoneticGivenName = null;
180             phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
181         }
182 
isEmpty()183         public boolean isEmpty() {
184             return TextUtils.isEmpty(givenNames)
185                     && TextUtils.isEmpty(middleName)
186                     && TextUtils.isEmpty(familyName)
187                     && TextUtils.isEmpty(suffix)
188                     && TextUtils.isEmpty(phoneticFamilyName)
189                     && TextUtils.isEmpty(phoneticMiddleName)
190                     && TextUtils.isEmpty(phoneticGivenName);
191         }
192 
193         @Override
toString()194         public String toString() {
195             return "[given: " + givenNames + " middle: " + middleName + " family: " + familyName
196                     + " ph/given: " + phoneticGivenName + " ph/middle: " + phoneticMiddleName
197                     + " ph/family: " + phoneticFamilyName + "]";
198         }
199 
200     }
201 
202     private static class NameTokenizer extends StringTokenizer {
203         private final String[] mTokens;
204         private int mDotBitmask;
205         private int mCommaBitmask;
206         private int mStartPointer;
207         private int mEndPointer;
208 
NameTokenizer(String fullName)209         public NameTokenizer(String fullName) {
210             super(fullName, " .,", true);
211 
212             mTokens = new String[MAX_TOKENS];
213 
214             // Iterate over tokens, skipping over empty ones and marking tokens that
215             // are followed by dots.
216             while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
217                 final String token = nextToken();
218                 if (token.length() > 0) {
219                     final char c = token.charAt(0);
220                     if (c == ' ') {
221                         continue;
222                     }
223                 }
224 
225                 if (mEndPointer > 0 && token.charAt(0) == '.') {
226                     mDotBitmask |= (1 << (mEndPointer - 1));
227                 } else if (mEndPointer > 0 && token.charAt(0) == ',') {
228                     mCommaBitmask |= (1 << (mEndPointer - 1));
229                 } else {
230                     mTokens[mEndPointer] = token;
231                     mEndPointer++;
232                 }
233             }
234         }
235 
236         /**
237          * Returns true if the token is followed by a dot in the original full name.
238          */
hasDot(int index)239         public boolean hasDot(int index) {
240             return (mDotBitmask & (1 << index)) != 0;
241         }
242 
243         /**
244          * Returns true if the token is followed by a comma in the original full name.
245          */
hasComma(int index)246         public boolean hasComma(int index) {
247             return (mCommaBitmask & (1 << index)) != 0;
248         }
249     }
250 
251     /**
252      * Constructor.
253      *
254      * @param commonPrefixes comma-separated list of common prefixes,
255      *            e.g. "Mr, Ms, Mrs"
256      * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
257      *            e.g. "d', st, st., von"
258      * @param commonSuffixes comma-separated list of common suffixes,
259      *            e.g. "Jr, M.D., MD, D.D.S."
260      * @param commonConjunctions comma-separated list of common conjuctions,
261      *            e.g. "AND, Or"
262      */
NameSplitter(String commonPrefixes, String commonLastNamePrefixes, String commonSuffixes, String commonConjunctions, Locale locale)263     public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
264             String commonSuffixes, String commonConjunctions, Locale locale) {
265         // TODO: refactor this to use <string-array> resources
266         mPrefixesSet = convertToSet(commonPrefixes);
267         mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
268         mSuffixesSet = convertToSet(commonSuffixes);
269         mConjuctions = convertToSet(commonConjunctions);
270         mLocale = locale != null ? locale : Locale.getDefault();
271         mLanguage = mLocale.getLanguage().toLowerCase();
272 
273         int maxLength = 0;
274         for (String suffix : mSuffixesSet) {
275             if (suffix.length() > maxLength) {
276                 maxLength = suffix.length();
277             }
278         }
279 
280         mMaxSuffixLength = maxLength;
281     }
282 
283     /**
284      * Converts a comma-separated list of Strings to a set of Strings. Trims strings
285      * and converts them to upper case.
286      */
convertToSet(String strings)287     private static HashSet<String> convertToSet(String strings) {
288         HashSet<String> set = new HashSet<String>();
289         if (strings != null) {
290             String[] split = strings.split(",");
291             for (int i = 0; i < split.length; i++) {
292                 set.add(split[i].trim().toUpperCase());
293             }
294         }
295         return set;
296     }
297 
298     /**
299      * Parses a full name and returns components as a list of tokens.
300      */
tokenize(String[] tokens, String fullName)301     public int tokenize(String[] tokens, String fullName) {
302         if (fullName == null) {
303             return 0;
304         }
305 
306         NameTokenizer tokenizer = new NameTokenizer(fullName);
307 
308         if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
309             return 0;
310         }
311 
312         String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
313         if (mPrefixesSet.contains(firstToken.toUpperCase())) {
314            tokenizer.mStartPointer++;
315         }
316         int count = 0;
317         for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
318             tokens[count++] = tokenizer.mTokens[i];
319         }
320 
321         return count;
322     }
323 
324 
325     /**
326      * Parses a full name and returns parsed components in the Name object.
327      */
split(Name name, String fullName)328     public void split(Name name, String fullName) {
329         if (fullName == null) {
330             return;
331         }
332 
333         int fullNameStyle = guessFullNameStyle(fullName);
334         if (fullNameStyle == FullNameStyle.CJK) {
335             fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
336         }
337 
338         name.fullNameStyle = fullNameStyle;
339 
340         switch (fullNameStyle) {
341             case FullNameStyle.CHINESE:
342                 splitChineseName(name, fullName);
343                 break;
344 
345             case FullNameStyle.JAPANESE:
346             case FullNameStyle.KOREAN:
347                 splitJapaneseOrKoreanName(name, fullName);
348                 break;
349 
350             default:
351                 splitWesternName(name, fullName);
352         }
353     }
354 
355     /**
356      * Splits a full name composed according to the Western tradition:
357      * <pre>
358      *   [prefix] given name(s) [[middle name] family name] [, suffix]
359      *   [prefix] family name, given name [middle name] [,suffix]
360      * </pre>
361      */
splitWesternName(Name name, String fullName)362     private void splitWesternName(Name name, String fullName) {
363         NameTokenizer tokens = new NameTokenizer(fullName);
364         parsePrefix(name, tokens);
365 
366         // If the name consists of just one or two tokens, treat them as first/last name,
367         // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
368         if (tokens.mEndPointer > 2) {
369             parseSuffix(name, tokens);
370         }
371 
372         if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
373             name.givenNames = tokens.mTokens[tokens.mStartPointer];
374         } else {
375             parseLastName(name, tokens);
376             parseMiddleName(name, tokens);
377             parseGivenNames(name, tokens);
378         }
379     }
380 
381     /**
382      * Splits a full name composed according to the Chinese tradition:
383      * <pre>
384      *   [family name [middle name]] given name
385      * </pre>
386      */
splitChineseName(Name name, String fullName)387     private void splitChineseName(Name name, String fullName) {
388         StringTokenizer tokenizer = new StringTokenizer(fullName);
389         while (tokenizer.hasMoreTokens()) {
390             String token = tokenizer.nextToken();
391             if (name.givenNames == null) {
392                 name.givenNames = token;
393             } else if (name.familyName == null) {
394                 name.familyName = name.givenNames;
395                 name.givenNames = token;
396             } else if (name.middleName == null) {
397                 name.middleName = name.givenNames;
398                 name.givenNames = token;
399             } else {
400                 name.middleName = name.middleName + name.givenNames;
401                 name.givenNames = token;
402             }
403         }
404 
405         // If a single word parse that word up.
406         if (name.givenNames != null && name.familyName == null && name.middleName == null) {
407             int length = fullName.length();
408             if (length == 2) {
409                 name.familyName = fullName.substring(0, 1);
410                 name.givenNames = fullName.substring(1);
411             } else if (length == 3) {
412                 name.familyName = fullName.substring(0, 1);
413                 name.middleName = fullName.substring(1, 2);
414                 name.givenNames = fullName.substring(2);
415             } else if (length == 4) {
416                 name.familyName = fullName.substring(0, 2);
417                 name.middleName = fullName.substring(2, 3);
418                 name.givenNames = fullName.substring(3);
419             }
420 
421         }
422     }
423 
424     /**
425      * Splits a full name composed according to the Japanese tradition:
426      * <pre>
427      *   [family name] given name(s)
428      * </pre>
429      */
splitJapaneseOrKoreanName(Name name, String fullName)430     private void splitJapaneseOrKoreanName(Name name, String fullName) {
431         StringTokenizer tokenizer = new StringTokenizer(fullName);
432         while (tokenizer.hasMoreTokens()) {
433             String token = tokenizer.nextToken();
434             if (name.givenNames == null) {
435                 name.givenNames = token;
436             } else if (name.familyName == null) {
437                 name.familyName = name.givenNames;
438                 name.givenNames = token;
439             } else {
440                 name.givenNames += " " + token;
441             }
442         }
443     }
444 
445     /**
446      * Concatenates components of a name according to the rules dictated by the name style.
447      *
448      * @param givenNameFirst is ignored for CJK display name styles
449      */
join(Name name, boolean givenNameFirst)450     public String join(Name name, boolean givenNameFirst) {
451         switch (name.fullNameStyle) {
452             case FullNameStyle.CJK:
453             case FullNameStyle.CHINESE:
454             case FullNameStyle.KOREAN:
455                 return join(name.familyName, name.middleName, name.givenNames, name.suffix,
456                         false, false, false);
457 
458             case FullNameStyle.JAPANESE:
459                 return join(name.familyName, name.middleName, name.givenNames, name.suffix,
460                         true, false, false);
461 
462             default:
463                 if (givenNameFirst) {
464                     return join(name.givenNames, name.middleName, name.familyName, name.suffix,
465                             true, false, true);
466                 } else {
467                     return join(name.familyName, name.givenNames, name.middleName, name.suffix,
468                             true, true, true);
469                 }
470         }
471     }
472 
473     /**
474      * Concatenates components of the phonetic name following the CJK tradition:
475      * family name + middle name + given name(s).
476      */
joinPhoneticName(Name name)477     public String joinPhoneticName(Name name) {
478         return join(name.phoneticFamilyName, name.phoneticMiddleName,
479                 name.phoneticGivenName, null, true, false, false);
480     }
481 
482     /**
483      * Concatenates parts of a full name inserting spaces and commas as specified.
484      */
join(String part1, String part2, String part3, String suffix, boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3)485     private String join(String part1, String part2, String part3, String suffix,
486             boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
487         boolean hasPart1 = !TextUtils.isEmpty(part1);
488         boolean hasPart2 = !TextUtils.isEmpty(part2);
489         boolean hasPart3 = !TextUtils.isEmpty(part3);
490         boolean hasSuffix = !TextUtils.isEmpty(suffix);
491 
492         boolean isSingleWord = true;
493         String singleWord = null;
494         if (hasPart1) {
495             singleWord = part1;
496         }
497 
498         if (hasPart2) {
499             if (singleWord != null) {
500                 isSingleWord = false;
501             } else {
502                 singleWord = part2;
503             }
504         }
505 
506         if (hasPart3) {
507             if (singleWord != null) {
508                 isSingleWord = false;
509             } else {
510                 singleWord = part3;
511             }
512         }
513 
514         if (hasSuffix) {
515             if (singleWord != null) {
516                 isSingleWord = false;
517             } else {
518                 singleWord = normalizedSuffix(suffix);
519             }
520         }
521 
522         if (isSingleWord) {
523             return singleWord;
524         }
525 
526         StringBuilder sb = new StringBuilder();
527         if (hasPart1) {
528             sb.append(part1);
529         }
530 
531         if (hasPart2) {
532             if (hasPart1) {
533                 if (useCommaAfterPart1) {
534                     sb.append(',');
535                 }
536                 if (useSpace) {
537                     sb.append(' ');
538                 }
539             }
540             sb.append(part2);
541         }
542 
543         if (hasPart3) {
544             if (hasPart1 || hasPart2) {
545                 if (useSpace) {
546                     sb.append(' ');
547                 }
548             }
549             sb.append(part3);
550         }
551 
552         if (hasSuffix) {
553             if (hasPart1 || hasPart2 || hasPart3) {
554                 if (useCommaAfterPart3) {
555                     sb.append(',');
556                 }
557                 if (useSpace) {
558                     sb.append(' ');
559                 }
560             }
561             sb.append(normalizedSuffix(suffix));
562         }
563 
564         return sb.toString();
565     }
566 
567     /**
568      * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
569      * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
570      */
normalizedSuffix(String suffix)571     private String normalizedSuffix(String suffix) {
572         int length = suffix.length();
573         if (length == 0 || suffix.charAt(length - 1) == '.') {
574             return suffix;
575         }
576 
577         String withDot = suffix + '.';
578         if (mSuffixesSet.contains(withDot.toUpperCase())) {
579             return withDot;
580         } else {
581             return suffix;
582         }
583     }
584 
585     /**
586      * If the supplied name style is undefined, returns a default based on the language,
587      * otherwise returns the supplied name style itself.
588      *
589      * @param nameStyle See {@link FullNameStyle}.
590      */
getAdjustedFullNameStyle(int nameStyle)591     public int getAdjustedFullNameStyle(int nameStyle) {
592         if (nameStyle == FullNameStyle.UNDEFINED) {
593             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
594                 return FullNameStyle.JAPANESE;
595             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
596                 return FullNameStyle.KOREAN;
597             } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
598                 return FullNameStyle.CHINESE;
599             } else {
600                 return FullNameStyle.WESTERN;
601             }
602         } else if (nameStyle == FullNameStyle.CJK) {
603             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
604                 return FullNameStyle.JAPANESE;
605             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
606                 return FullNameStyle.KOREAN;
607             } else {
608                 return FullNameStyle.CHINESE;
609             }
610         }
611         return nameStyle;
612     }
613 
614     /**
615      * Parses the first word from the name if it is a prefix.
616      */
parsePrefix(Name name, NameTokenizer tokens)617     private void parsePrefix(Name name, NameTokenizer tokens) {
618         if (tokens.mStartPointer == tokens.mEndPointer) {
619             return;
620         }
621 
622         String firstToken = tokens.mTokens[tokens.mStartPointer];
623         if (mPrefixesSet.contains(firstToken.toUpperCase())) {
624             name.prefix = firstToken;
625             tokens.mStartPointer++;
626         }
627     }
628 
629     /**
630      * Parses the last word(s) from the name if it is a suffix.
631      */
parseSuffix(Name name, NameTokenizer tokens)632     private void parseSuffix(Name name, NameTokenizer tokens) {
633         if (tokens.mStartPointer == tokens.mEndPointer) {
634             return;
635         }
636 
637         String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
638         if (lastToken.length() > mMaxSuffixLength) {
639             return;
640         }
641 
642         String normalized = lastToken.toUpperCase();
643         if (mSuffixesSet.contains(normalized)) {
644             name.suffix = lastToken;
645             tokens.mEndPointer--;
646             return;
647         }
648 
649         if (tokens.hasDot(tokens.mEndPointer - 1)) {
650             lastToken += '.';
651         }
652         normalized += ".";
653 
654         // Take care of suffixes like M.D. and D.D.S.
655         int pos = tokens.mEndPointer - 1;
656         while (normalized.length() <= mMaxSuffixLength) {
657 
658             if (mSuffixesSet.contains(normalized)) {
659                 name.suffix = lastToken;
660                 tokens.mEndPointer = pos;
661                 return;
662             }
663 
664             if (pos == tokens.mStartPointer) {
665                 break;
666             }
667 
668             pos--;
669             if (tokens.hasDot(pos)) {
670                 lastToken = tokens.mTokens[pos] + "." + lastToken;
671             } else {
672                 lastToken = tokens.mTokens[pos] + " " + lastToken;
673             }
674 
675             normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
676         }
677     }
678 
parseLastName(Name name, NameTokenizer tokens)679     private void parseLastName(Name name, NameTokenizer tokens) {
680         if (tokens.mStartPointer == tokens.mEndPointer) {
681             return;
682         }
683 
684         // If the first word is followed by a comma, assume that it's the family name
685         if (tokens.hasComma(tokens.mStartPointer)) {
686            name.familyName = tokens.mTokens[tokens.mStartPointer];
687            tokens.mStartPointer++;
688            return;
689         }
690 
691         // If the second word is followed by a comma and the first word
692         // is a last name prefix as in "de Sade" and "von Cliburn", treat
693         // the first two words as the family name.
694         if (tokens.mStartPointer + 1 < tokens.mEndPointer
695                 && tokens.hasComma(tokens.mStartPointer + 1)
696                 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
697             String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
698             if (tokens.hasDot(tokens.mStartPointer)) {
699                 familyNamePrefix += '.';
700             }
701             name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
702             tokens.mStartPointer += 2;
703             return;
704         }
705 
706         // Finally, assume that the last word is the last name
707         name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
708         tokens.mEndPointer--;
709 
710         // Take care of last names like "de Sade" and "von Cliburn"
711         if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
712             String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
713             if (isFamilyNamePrefix(lastNamePrefix)) {
714                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
715                     lastNamePrefix += '.';
716                 }
717                 name.familyName = lastNamePrefix + " " + name.familyName;
718                 tokens.mEndPointer--;
719             }
720         }
721     }
722 
723     /**
724      * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
725      */
isFamilyNamePrefix(String word)726     private boolean isFamilyNamePrefix(String word) {
727         final String normalized = word.toUpperCase();
728 
729         return mLastNamePrefixesSet.contains(normalized)
730                 || mLastNamePrefixesSet.contains(normalized + ".");
731     }
732 
733 
parseMiddleName(Name name, NameTokenizer tokens)734     private void parseMiddleName(Name name, NameTokenizer tokens) {
735         if (tokens.mStartPointer == tokens.mEndPointer) {
736             return;
737         }
738 
739         if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
740             if ((tokens.mEndPointer - tokens.mStartPointer) == 2
741                     || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
742                             toUpperCase())) {
743                 name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
744                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
745                     name.middleName += '.';
746                 }
747                 tokens.mEndPointer--;
748             }
749         }
750     }
751 
parseGivenNames(Name name, NameTokenizer tokens)752     private void parseGivenNames(Name name, NameTokenizer tokens) {
753         if (tokens.mStartPointer == tokens.mEndPointer) {
754             return;
755         }
756 
757         if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
758             name.givenNames = tokens.mTokens[tokens.mStartPointer];
759         } else {
760             StringBuilder sb = new StringBuilder();
761             for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
762                 if (i != tokens.mStartPointer) {
763                     sb.append(' ');
764                 }
765                 sb.append(tokens.mTokens[i]);
766                 if (tokens.hasDot(i)) {
767                     sb.append('.');
768                 }
769             }
770             name.givenNames = sb.toString();
771         }
772     }
773 
774     /**
775      * Makes the best guess at the expected full name style based on the character set
776      * used in the supplied name.  If the phonetic name is also supplied, tries to
777      * differentiate between Chinese, Japanese and Korean based on the alphabet used
778      * for the phonetic name.
779      */
guessNameStyle(Name name)780     public void guessNameStyle(Name name) {
781         guessFullNameStyle(name);
782         guessPhoneticNameStyle(name);
783         name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
784                 name.phoneticNameStyle);
785     }
786 
787     /**
788      * Updates the display name style according to the phonetic name style if we
789      * were unsure about display name style based on the name components, but
790      * phonetic name makes it more definitive.
791      */
getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle)792     public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
793         if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
794             if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
795                 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
796                     return FullNameStyle.JAPANESE;
797                 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
798                     return FullNameStyle.KOREAN;
799                 }
800                 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
801                     return FullNameStyle.CHINESE;
802                 }
803             }
804         }
805         return nameStyle;
806     }
807 
808     /**
809      * Makes the best guess at the expected full name style based on the character set
810      * used in the supplied name.
811      */
guessFullNameStyle(NameSplitter.Name name)812     private void guessFullNameStyle(NameSplitter.Name name) {
813         if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
814             return;
815         }
816 
817         int bestGuess = guessFullNameStyle(name.givenNames);
818         // A mix of Hanzi and latin chars are common in China, so we have to go through all names
819         // if the name is not JANPANESE or KOREAN.
820         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
821                 && bestGuess != FullNameStyle.WESTERN) {
822             name.fullNameStyle = bestGuess;
823             return;
824         }
825 
826         int guess = guessFullNameStyle(name.familyName);
827         if (guess != FullNameStyle.UNDEFINED) {
828             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
829                 name.fullNameStyle = guess;
830                 return;
831             }
832             bestGuess = guess;
833         }
834 
835         guess = guessFullNameStyle(name.middleName);
836         if (guess != FullNameStyle.UNDEFINED) {
837             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
838                 name.fullNameStyle = guess;
839                 return;
840             }
841             bestGuess = guess;
842         }
843 
844         name.fullNameStyle = bestGuess;
845     }
846 
guessFullNameStyle(String name)847     public int guessFullNameStyle(String name) {
848         if (name == null) {
849             return FullNameStyle.UNDEFINED;
850         }
851 
852         int nameStyle = FullNameStyle.UNDEFINED;
853         int length = name.length();
854         int offset = 0;
855         while (offset < length) {
856             int codePoint = Character.codePointAt(name, offset);
857             if (Character.isLetter(codePoint)) {
858                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
859 
860                 if (!isLatinUnicodeBlock(unicodeBlock)) {
861 
862                     if (isCJKUnicodeBlock(unicodeBlock)) {
863                         // We don't know if this is Chinese, Japanese or Korean -
864                         // trying to figure out by looking at other characters in the name
865                         return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
866                     }
867 
868                     if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
869                         return FullNameStyle.JAPANESE;
870                     }
871 
872                     if (isKoreanUnicodeBlock(unicodeBlock)) {
873                         return FullNameStyle.KOREAN;
874                     }
875                 }
876                 nameStyle = FullNameStyle.WESTERN;
877             }
878             offset += Character.charCount(codePoint);
879         }
880         return nameStyle;
881     }
882 
guessCJKNameStyle(String name, int offset)883     private int guessCJKNameStyle(String name, int offset) {
884         int length = name.length();
885         while (offset < length) {
886             int codePoint = Character.codePointAt(name, offset);
887             if (Character.isLetter(codePoint)) {
888                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
889                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
890                     return FullNameStyle.JAPANESE;
891                 }
892                 if (isKoreanUnicodeBlock(unicodeBlock)) {
893                     return FullNameStyle.KOREAN;
894                 }
895             }
896             offset += Character.charCount(codePoint);
897         }
898 
899         return FullNameStyle.CJK;
900     }
901 
guessPhoneticNameStyle(NameSplitter.Name name)902     private void guessPhoneticNameStyle(NameSplitter.Name name) {
903         if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
904             return;
905         }
906 
907         int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
908         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
909             name.phoneticNameStyle = bestGuess;
910             return;
911         }
912 
913         int guess = guessPhoneticNameStyle(name.phoneticGivenName);
914         if (guess != FullNameStyle.UNDEFINED) {
915             if (guess != FullNameStyle.CJK) {
916                 name.phoneticNameStyle = guess;
917                 return;
918             }
919             bestGuess = guess;
920         }
921 
922         guess = guessPhoneticNameStyle(name.phoneticMiddleName);
923         if (guess != FullNameStyle.UNDEFINED) {
924             if (guess != FullNameStyle.CJK) {
925                 name.phoneticNameStyle = guess;
926                 return;
927             }
928             bestGuess = guess;
929         }
930     }
931 
guessPhoneticNameStyle(String name)932     public int guessPhoneticNameStyle(String name) {
933         if (name == null) {
934             return PhoneticNameStyle.UNDEFINED;
935         }
936 
937         int nameStyle = PhoneticNameStyle.UNDEFINED;
938         int length = name.length();
939         int offset = 0;
940         while (offset < length) {
941             int codePoint = Character.codePointAt(name, offset);
942             if (Character.isLetter(codePoint)) {
943                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
944                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
945                     return PhoneticNameStyle.JAPANESE;
946                 }
947                 if (isKoreanUnicodeBlock(unicodeBlock)) {
948                     return PhoneticNameStyle.KOREAN;
949                 }
950                 if (isLatinUnicodeBlock(unicodeBlock)) {
951                     return PhoneticNameStyle.PINYIN;
952                 }
953             }
954             offset += Character.charCount(codePoint);
955         }
956 
957         return nameStyle;
958     }
959 
isLatinUnicodeBlock(UnicodeBlock unicodeBlock)960     private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
961         return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
962                 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
963                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
964                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
965                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
966     }
967 
isCJKUnicodeBlock(UnicodeBlock block)968     private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
969         return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
970                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
971                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
972                 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
973                 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
974                 || block == UnicodeBlock.CJK_COMPATIBILITY
975                 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
976                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
977                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
978     }
979 
isKoreanUnicodeBlock(UnicodeBlock unicodeBlock)980     private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
981         return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
982                 unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
983                 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
984     }
985 
isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock)986     private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
987         return unicodeBlock == UnicodeBlock.KATAKANA ||
988                 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
989                 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
990                 unicodeBlock == UnicodeBlock.HIRAGANA;
991     }
992 }
993