/*
 **********************************************************************
 * Copyright (c) 2002-2011, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Mark Davis
 **********************************************************************
 */
package org.unicode.cldr.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.UnicodeSet;

/**
 * Splits an underscore-separated locale ID (for example "en_Latn_US_POSIX") into
 * language, script, region and variants, and offers static helpers for walking
 * from a locale ID to its parent.
 *
 * An instance is mutable: {@link #set(String)} and the setXxx methods overwrite
 * its state, and {@link #getSiblings(Set)} temporarily re-parses other IDs
 * through the same instance.
 */
public class LocaleIDParser {
    /**
     * @return Returns the language.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @return Returns the language, followed by "_" and the script when a script is present.
     */
    public String getLanguageScript() {
        if (script.length() != 0) return language + "_" + script;
        return language;
    }

    /**
     * Same as {@link #getLanguageScript(Collection, Set)} with a newly created sorted set as output.
     *
     * @param in locale IDs to parse
     * @return a new TreeSet holding the language(+script) part of each ID
     */
    public static Set<String> getLanguageScript(Collection<String> in) {
        return getLanguageScript(in, null);
    }

    /**
     * Parses each locale ID in the input and collects its language(+script) part.
     *
     * @param in locale IDs to parse
     * @param output set to add the results to; if null, a new TreeSet is created
     * @return the set the results were added to
     */
    public static Set<String> getLanguageScript(Collection<String> in, Set<String> output) {
        if (output == null) output = new TreeSet<>();
        LocaleIDParser lparser = new LocaleIDParser();
        for (String localeID : in) {
            output.add(lparser.set(localeID).getLanguageScript());
        }
        return output;
    }

    /**
     * @return Returns the region.
     */
    public String getRegion() {
        return region;
    }

    /**
     * @return Returns the script.
     */
    public String getScript() {
        return script;
    }

    /**
     * @return Returns the variants (a copy of the internal array).
     */
    public String[] getVariants() {
        return variants.clone();
    }

    // TODO, update to RFC3066
    // http://www.inter-locale.com/ID/draft-phillips-langtags-08.html
    private String language;
    // script, region and variants start out with the same "empty" values that set() resets
    // them to, so that a parser populated only through the setXxx methods can be used safely.
    private String script = "";
    private String region = "";
    private String[] variants = new String[0];

    static final UnicodeSet letters = new UnicodeSet("[a-zA-Z]");
    static final UnicodeSet digits = new UnicodeSet("[0-9]");

    /**
     * Parses a locale ID into this object, replacing any previous state.
     *
     * The ID is split on '_'. The first piece is the language. The next piece is taken as the
     * script if it is exactly 4 characters long (only the length is checked). The next piece is
     * taken as the region if it is 2 ASCII letters or 3 ASCII digits. All following pieces, up to
     * (not including) the first empty piece, are the variants.
     *
     * @param localeID the locale ID to parse
     * @return this object, for chaining
     */
    public LocaleIDParser set(String localeID) {
        region = script = "";
        variants = new String[0];

        // Size the array to the actual number of pieces, so there is no fixed upper limit.
        int pieceCount = 1;
        for (int k = 0; k < localeID.length(); ++k) {
            if (localeID.charAt(k) == '_') {
                ++pieceCount;
            }
        }
        String[] pieces = new String[pieceCount];
        Utility.split(localeID, '_', pieces);

        int i = 0;
        language = pieces[i++];
        if (i < pieces.length && pieces[i].length() == 4) {
            script = pieces[i++];
        }
        if (i < pieces.length && isRegionSubtag(pieces[i])) {
            region = pieces[i++];
        }
        List<String> al = new ArrayList<>();
        // An empty piece terminates the variant list.
        while (i < pieces.length && pieces[i].length() > 0) {
            al.add(pieces[i++]);
        }
        variants = al.toArray(new String[0]);
        return this;
    }

    /** Returns true if the piece is 2 ASCII letters or 3 ASCII digits. */
    private static boolean isRegionSubtag(String piece) {
        return piece.length() == 2 && letters.containsAll(piece)
            || piece.length() == 3 && digits.containsAll(piece);
    }

    /**
     * Get the parent of a locale. If the input is "root", then return null.
     * For example, if localeName is "fr_CA", return "fr".
     *
     * Only works on canonical locale names (right casing, etc.)!
     *
     * An explicit parent from the supplemental data, if there is one, takes precedence.
     * Otherwise the last "_"-separated piece is removed; if the locale is language+script
     * and the script is not the default script for that language, the parent is "root".
     *
     * Formerly this function returned an empty string when localeName was "_VETTING".
     * Now it returns "root" where it would have returned an empty string.
     * TODO: explain "__VETTING", somehow related to SUMMARY_LOCALE. Note that
     * CLDRLocale.process() changes "__" to "_" before this function is called.
     * Reference: https://unicode-org.atlassian.net/browse/CLDR-13133
     *
     * @param localeName the locale ID whose parent is wanted
     * @return the parent locale ID, or null if localeName is "root"
     */
    public static String getParent(String localeName) {
        SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
        String explicitParent = sdi.getExplicitParentLocale(localeName);
        if (explicitParent != null) {
            return explicitParent;
        }
        int pos = localeName.lastIndexOf('_');
        if (pos < 0) {
            return localeName.equals("root") ? null : "root";
        }
        String truncated = localeName.substring(0, pos);
        // if the final item is a script, and it is not the default content, then go directly to root
        if (getScriptPosition(localeName) > 0) {
            String script = localeName.substring(pos + 1);
            String defaultScript = sdi.getDefaultScript(truncated);
            if (!script.equals(defaultScript)) {
                return "root";
            }
        }
        if (truncated.length() == 0) {
            return "root";
        }
        return truncated;
    }

    /**
     * Return the base language subtag: en_US => en, en_Latn_US => en, en => en, root => root
     *
     * @param localeID the locale ID
     * @return everything before the first '_', or the whole input if it contains no '_'
     */
    public static String getSimpleBaseLanguage(String localeID) {
        int pos = localeID.indexOf('_');
        if (pos >= 0) {
            return localeID.substring(0, pos);
        }
        return localeID;
    }

    /**
     * If the locale consists of baseLanguage+script, return the position of the separator, otherwise -1.
     * "Script" here means: exactly one '_' in the ID, followed by exactly 4 characters.
     *
     * @param locale the locale ID
     * @return the index of the '_' separator, or -1
     */
    public static int getScriptPosition(String locale) {
        int pos = locale.indexOf('_');
        if (pos >= 0 && pos + 5 == locale.length()) {
            int pos2 = locale.indexOf('_', pos + 1);
            if (pos2 < 0) {
                return pos;
            }
        }
        return -1;
    }

    /**
     * Utility to get the simple parent of a locale. If the input is "root", then the output is null.
     * This method is similar to the getParent() method above, except that it does NOT pay any attention
     * to the explicit parent locales information. Thus, getParent("zh_Hant") will return "root",
     * but getSimpleParent("zh_Hant") would return "zh".
     *
     * @param localeName the locale ID
     * @return the ID with its last "_"-separated piece removed; "root" if there is no '_';
     *         null if localeName is "root" or CLDRFile.SUPPLEMENTAL_NAME
     */
    public static String getSimpleParent(String localeName) {
        int pos = localeName.lastIndexOf('_');
        if (pos >= 0) {
            return localeName.substring(0, pos);
        }
        if (localeName.equals("root") || localeName.equals(CLDRFile.SUPPLEMENTAL_NAME)) return null;
        return "root";
    }

    public LocaleIDParser setLanguage(String language) {
        this.language = language;
        return this;
    }

    public LocaleIDParser setRegion(String region) {
        this.region = region;
        return this;
    }

    public LocaleIDParser setScript(String script) {
        this.script = script;
        return this;
    }

    /**
     * Sets the variants; the array is copied.
     */
    public LocaleIDParser setVariants(String[] variants) {
        this.variants = variants.clone();
        return this;
    }

    public enum Level {
        Language, Script, Region, Variants, Other
    }

    /**
     * Returns the set of levels present in this locale ID.
     *
     * @return a set that always contains Level.Language, plus Level.Script, Level.Region and
     *         Level.Variants for each of those parts that is non-empty
     */
    public Set<Level> getLevels() {
        EnumSet<Level> result = EnumSet.of(Level.Language);
        if (getScript().length() != 0) result.add(Level.Script);
        if (getRegion().length() != 0) result.add(Level.Region);
        if (getVariants().length != 0) result.add(Level.Variants);
        return result;
    }

    /**
     * Returns the IDs from the given set that start with this locale's parent prefix and have
     * the same set of levels (see {@link #getLevels()}) as this locale.
     *
     * This parser is re-used to parse each candidate and is restored to its original
     * locale ID before returning.
     *
     * @param set candidate locale IDs
     * @return a new sorted set of the matching IDs
     */
    public Set<String> getSiblings(Set<String> set) {
        Set<Level> myLevel = getLevels();
        String localeID = toString();
        String parentID = getParent(localeID);

        String prefix = (parentID == null || "root".equals(parentID)) ? "" : parentID + "_";
        Set<String> siblings = new TreeSet<>();
        for (String id : set) {
            if (id.startsWith(prefix) && set(id).getLevels().equals(myLevel)) {
                siblings.add(id);
            }
        }
        set(localeID); // leave in known state
        return siblings;
    }

    /**
     * @return the locale ID rebuilt from the current fields, joined with '_'; empty script and
     *         region are omitted
     */
    @Override
    public String toString() {
        StringBuilder result = new StringBuilder(language);
        if (script.length() != 0) result.append('_').append(script);
        if (region.length() != 0) result.append('_').append(region);
        for (String variant : variants) {
            result.append('_').append(variant);
        }
        return result.toString();
    }
}