1 package org.unicode.cldr.tool; 2 3 import java.util.Arrays; 4 import java.util.Collections; 5 import java.util.HashSet; 6 import java.util.LinkedHashMap; 7 import java.util.List; 8 import java.util.Locale; 9 import java.util.Map; 10 import java.util.regex.Matcher; 11 import java.util.regex.Pattern; 12 13 import org.unicode.cldr.util.PatternCache; 14 15 /** 16 * Parse Locales, extended to BCP 47 and CLDR. Also normalizes the case of the results. 17 * Only does syntactic parse: does not replace deprecated elements; does not check for validity. 18 * Will throw IllegalArgumentException for duplicate variants and extensions. 19 * 20 * @author markdavis 21 */ 22 class SimpleLocaleParser { 23 // mechanically generated regex -- don't worry about trying to read it! 24 // if we want to allow multiple --, change [-_] into [-_]+ 25 private static final Pattern rootPattern = Pattern.compile( 26 "(?:" 27 + 28 " (?: ( [a-z]{2,8} )" 29 + // language 30 " (?: [-_] ( [a-z]{4} ) )?" 31 + // script 32 " (?: [-_] ( [a-z]{2} | [0-9]{3} ) )?" 33 + // region 34 " (?: [-_] ( (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) (?: [-_] (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) )* ) )?" 35 + // variant(s) 36 " (?: [-_] ( [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ (?: [-_] [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ )* ) )?" 37 + // extensions 38 " (?: [-_] ( x (?: [-_] [a-z 0-9]{1,8} )+ ) )? )" 39 + // private use 40 " | ( x (?: [-_] [a-z 0-9]{1,8} )+ )" 41 + // private use 42 " | ( en [-_] GB [-_] oed" 43 + // grandfathered gorp 44 " | i [-_] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )" 45 + 46 " | no [-_] (?: bok | nyn )" + 47 " | sgn [-_] (?: BE [-_] (?: fr | nl) | CH [-_] de )" + 48 " | zh [-_] (?: cmn (?: [-_] Hans | [-_] Hant )? | gan | min (?: [-_] nan)? | wuu | yue ) ) )" + 49 " (?: \\@ ((?: [a-z 0-9]+ \\= [a-z 0-9]+) (?: \\; (?: [a-z 0-9]+ \\= [a-z 0-9]+))*))?" + // CLDR/ICU 50 // keywords 51 "", 52 Pattern.COMMENTS | Pattern.CASE_INSENSITIVE); // TODO change above to be lowercase, since source is 53 // already when we compare 54 // Other regex patterns for splitting apart lists of items detected above. 55 private static final Pattern variantSeparatorPattern = PatternCache.get("[-_]"); 56 private static final Pattern extensionPattern = Pattern.compile( 57 "([a-z]) [-_] ( [a-z 0-9]{2,8} (?:[-_] [a-z 0-9]{2,8})* )", Pattern.COMMENTS); 58 private static final Pattern privateUsePattern = Pattern.compile( 59 "(x) [-_] ( [a-z 0-9]{1,8} (?:[-_] [a-z 0-9]{1,8})* )", Pattern.COMMENTS); 60 private static final Pattern keywordPattern = Pattern.compile("([a-z 0-9]+) \\= ([a-z 0-9]+)", Pattern.COMMENTS); 61 62 /** 63 * The fields set by set(). 64 */ 65 private String language; 66 private String script; 67 private String region; 68 private List<String> variants; 69 private Map<String, String> extensions; 70 71 /** 72 * Set the object to the source. 73 * <p> 74 * Example (artificially complicated): 75 * 76 * <pre> 77 * myParser.set("zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic"); 78 * String language = myParser.getLanguage(); 79 * </pre> 80 * 81 * @param source 82 * @return 83 */ set(String source)84 public boolean set(String source) { 85 source = source.toLowerCase(Locale.ENGLISH); 86 Matcher root = rootPattern.matcher(source); 87 if (!root.matches()) { 88 return false; 89 } 90 language = root.group(1); 91 if (language == null) { 92 language = root.group(8); // grandfathered 93 if (language == null) { 94 language = "und"; // placeholder for completely private use 95 } 96 } 97 script = root.group(2); 98 if (script == null) { 99 script = ""; 100 } else { 101 script = script.substring(0, 1).toUpperCase(Locale.ENGLISH) + script.substring(1); 102 } 103 region = root.group(3); 104 if (region == null) { 105 region = ""; 106 } else { 107 region = region.toUpperCase(Locale.ENGLISH); 108 } 109 final String variantList = root.group(4); 110 if (variantList == null) { 111 variants = Collections.emptyList(); 112 } else { 113 // make uppercase for compatibility with CLDR. 114 variants = Arrays.asList(variantSeparatorPattern.split(variantList.toUpperCase(Locale.ENGLISH))); 115 // check for duplicate variants 116 if (new HashSet<String>(variants).size() != variants.size()) { 117 throw new IllegalArgumentException("Duplicate variants"); 118 } 119 } 120 extensions = new LinkedHashMap<String, String>(); // group 5 are extensions, 6 is private use 121 // extensions are a bit more complicated 122 addExtensions(root.group(5), extensionPattern); 123 addExtensions(root.group(6), privateUsePattern); 124 addExtensions(root.group(7), privateUsePattern); 125 addExtensions(root.group(9), keywordPattern); 126 extensions = Collections.unmodifiableMap(extensions); 127 return true; 128 } 129 addExtensions(String item, Pattern pattern)130 private void addExtensions(String item, Pattern pattern) { 131 if (item != null) { 132 Matcher extension = pattern.matcher(item); 133 while (extension.find()) { 134 final String key = extension.group(1); 135 // check for duplicate keys 136 if (extensions.containsKey(key)) { 137 throw new IllegalArgumentException("duplicate key: " + key); 138 } 139 extensions.put(key, extension.group(2)); 140 } 141 } 142 } 143 144 /** 145 * Return BCP 47 language subtag (may be ISO registered code). 146 * If the language tag is irregular, then the entire tag is in the language field. 147 * If the entire code is private use, then the language code is "und". 148 * Examples: 149 * <table style="border-width:1; border-style:collapse"> 150 * <tr> 151 * <th>Input String</th> 152 * <th>Parsed</th> 153 * </tr> 154 * <tr> 155 * <td>zh-cmn-Hans</td> 156 * <td>{language=zh-cmn-hans, script=, country=, variants=[], keywords={}}</td> 157 * </tr> 158 * <tr> 159 * <td>i-default@abc=def</td> 160 * <td>{language=i-default, script=, country=, variants=[], keywords={abc=def}}</td> 161 * </tr> 162 * <tr> 163 * <td>x-foobar@abc=def</td> 164 * <td>{language=und, script=, country=, variants=[], keywords={x=foobar, abc=def}}</td> 165 * </tr> 166 * </table> 167 * 168 * @return language subtag, lowercased. 169 */ getLanguage()170 public String getLanguage() { 171 return language; 172 } 173 174 /** 175 * Return BCP 47 script subtag (may be ISO or UN) 176 * 177 * @return script subtag, titlecased. 178 */ getScript()179 public String getScript() { 180 return script; 181 } 182 183 /** 184 * Return BCP 47 region subtag (may be ISO or UN) 185 * 186 * @return country (region) subtag, uppercased. 187 */ getCountry()188 public String getCountry() { 189 return region; 190 } 191 192 /** 193 * Return immutable list of BCP 47 variants 194 * 195 * @return list of uppercased variants. 196 */ getVariants()197 public List<String> getVariants() { 198 return variants; 199 } 200 201 /** 202 * Return the first variant, for compatibility 203 * 204 * @return first (uppercased) variant 205 */ getVariant()206 public String getVariant() { 207 return variants.size() == 0 ? "" : variants.iterator().next(); 208 } 209 210 /** 211 * Return immutable map of key/value extensions. Includes BCP 47 extensions and private use, also locale keyword 212 * extensions. If the entire code is private use, 213 * then the language is set to "und" for consistency. 214 * <p> 215 * Example: 216 * <table style="border-width:1; border-style:collapse"> 217 * <tr> 218 * <th>Input String</th> 219 * <th>Parsed</th> 220 * </tr> 221 * <tr> 222 * <td>zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic</td> 223 * <td>{language=zh, script=Hans, country=HK, variants=[SCOUSE], keywords={a=foobar, x=a-en, collation=phonebook, 224 * calendar=islamic}}</td> 225 * </tr> 226 * </table> 227 * 228 * @return map of key/value pairs, lowercased. 229 */ getExtensions()230 public Map<String, String> getExtensions() { 231 return extensions; 232 } 233 toString()234 public String toString() { 235 return "{language=" + language 236 + ", script=" + script 237 + ", country=" + region 238 + ", variants=" + variants 239 + ", keywords=" + extensions 240 + "}"; 241 } 242 }