1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Author: Mark Davis 7 ********************************************************************** 8 */ 9 package org.unicode.cldr.util; 10 11 import java.util.Collection; 12 import java.util.Collections; 13 import java.util.EnumSet; 14 import java.util.Iterator; 15 import java.util.List; 16 import java.util.Locale; 17 import java.util.Map; 18 import java.util.Map.Entry; 19 import java.util.NoSuchElementException; 20 import java.util.Set; 21 import java.util.StringTokenizer; 22 import java.util.TreeMap; 23 import java.util.TreeSet; 24 import java.util.regex.Pattern; 25 26 import org.unicode.cldr.tool.LikelySubtags; 27 28 import com.google.common.base.CharMatcher; 29 import com.google.common.base.Joiner; 30 import com.google.common.base.Splitter; 31 import com.google.common.collect.ImmutableList; 32 import com.google.common.collect.ImmutableMap; 33 import com.ibm.icu.impl.Relation; 34 import com.ibm.icu.impl.Row.R2; 35 import com.ibm.icu.text.UnicodeSet; 36 37 public class LanguageTagParser { 38 /** 39 * @return Returns the language, or "" if none. 40 */ getLanguage()41 public String getLanguage() { 42 return language; 43 } 44 45 /** 46 * @return Returns the script, or "" if none. 47 */ getScript()48 public String getScript() { 49 return script; 50 } 51 52 /** 53 * @return Returns the region, or "" if none. 54 */ getRegion()55 public String getRegion() { 56 return region; 57 } 58 59 /** 60 * @return Returns the variants. 61 */ getVariants()62 public List<String> getVariants() { 63 return ImmutableList.copyOf(variants); 64 } 65 66 /** 67 * @return Returns the grandfathered flag 68 */ isGrandfathered()69 public boolean isGrandfathered() { 70 return grandfathered; 71 } 72 73 /** 74 * @return Returns the extensions. 75 */ 76 @Deprecated getExtensions()77 public Map<String, String> getExtensions() { 78 return OutputOption.ICU.convert(extensions); 79 } 80 81 /** 82 * @return Returns the localeExtensions. 83 */ 84 @Deprecated getLocaleExtensions()85 public Map<String, String> getLocaleExtensions() { 86 return OutputOption.ICU.convert(localeExtensions); 87 } 88 89 /** 90 * @return Returns the extensions. 91 */ getExtensionsDetailed()92 public Map<String, List<String>> getExtensionsDetailed() { 93 return ImmutableMap.copyOf(extensions); 94 } 95 96 /** 97 * @return Returns the localeExtensions. 98 */ getLocaleExtensionsDetailed()99 public Map<String, List<String>> getLocaleExtensionsDetailed() { 100 return ImmutableMap.copyOf(localeExtensions); 101 } 102 103 /** 104 * @return Returns the original, preparsed language tag 105 */ getOriginal()106 public String getOriginal() { 107 return original; 108 } 109 110 /** 111 * @return Returns the language-script (or language) part of a tag. 112 */ getLanguageScript()113 public String getLanguageScript() { 114 if (script.length() != 0) return language + "_" + script; 115 return language; 116 } 117 118 /** 119 * @param in 120 * Collection of language tag strings 121 * @return Returns each of the language-script tags in the collection. 122 */ getLanguageScript(Collection<String> in)123 public static Set<String> getLanguageScript(Collection<String> in) { 124 return getLanguageAndScript(in, null); 125 } 126 127 /** 128 * @param in 129 * Collection of language tag strings 130 * @return Returns each of the language-script tags in the collection. 131 */ getLanguageAndScript(Collection<String> in, Set<String> output)132 public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) { 133 if (output == null) output = new TreeSet<String>(); 134 LanguageTagParser lparser = new LanguageTagParser(); 135 for (Iterator<String> it = in.iterator(); it.hasNext();) { 136 output.add(lparser.set(it.next()).getLanguageScript()); 137 } 138 return output; 139 } 140 141 // private fields 142 143 private String original; 144 private boolean grandfathered = false; 145 private String language; 146 private String script; 147 private String region; 148 private Set<String> variants = new TreeSet<String>(); 149 private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map 150 private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>(); 151 152 private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze(); 153 private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze(); 154 private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze(); 155 private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze(); 156 private static final UnicodeSet X = new UnicodeSet("[xX]").freeze(); 157 private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze(); 158 private static StandardCodes standardCodes = StandardCodes.make(); 159 private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered"); 160 private static final String separator = "-_"; // '-' alone for 3066bis language tags 161 private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze(); 162 private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator)); 163 private static final Splitter SPLIT_COLON = Splitter.on(';'); 164 private static final Splitter SPLIT_EQUAL = Splitter.on('='); 165 private static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); 166 private static final Relation<R2<String, String>, String> BCP47_ALIASES = SDI.getBcp47Aliases(); 167 168 /** 169 * Parses out a language tag, setting a number of fields that can subsequently be retrieved. 170 * If a private-use field is found, it is returned as the last extension.<br> 171 * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see 172 * isValid. 173 * 174 * @param languageTag 175 * @return 176 */ set(String languageTag)177 public LanguageTagParser set(String languageTag) { 178 if (languageTag.length() == 0) { 179 throw new IllegalArgumentException("Language tag cannot be empty"); 180 } 181 languageTag = languageTag.toLowerCase(Locale.ROOT); 182 183 // clear everything out 184 language = region = script = ""; 185 grandfathered = false; 186 variants.clear(); 187 extensions.clear(); 188 localeExtensions.clear(); 189 original = languageTag; 190 int localeExtensionsPosition = languageTag.indexOf('@'); 191 if (localeExtensionsPosition >= 0) { 192 final String localeExtensionsString = languageTag.substring(localeExtensionsPosition + 1); 193 for (String keyValue : SPLIT_COLON.split(localeExtensionsString)) { 194 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator(); 195 final String key = keyValuePair.next(); 196 final String value = keyValuePair.next(); 197 if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) { 198 throwError(keyValue, "Invalid key/value pair"); 199 } 200 localeExtensions.put(key, SPLIT_BAR.splitToList(value)); 201 } 202 languageTag = languageTag.substring(0, localeExtensionsPosition); 203 } 204 205 // first test for grandfathered 206 if (grandfatheredCodes.contains(languageTag)) { 207 language = languageTag; 208 grandfathered = true; 209 return this; 210 } 211 212 // each time we fetch a token, we check for length from 1..8, and all alphanum 213 StringTokenizer st = new StringTokenizer(languageTag, separator); 214 String subtag; 215 try { 216 subtag = getSubtag(st); 217 } catch (Exception e1) { 218 throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1); 219 } 220 221 // check for private use (x-...) and return if so 222 if (subtag.equalsIgnoreCase("x")) { 223 getExtension(subtag, st, 1); 224 return this; 225 } 226 227 // check that language subtag is valid 228 if (!ALPHA.containsAll(subtag) || subtag.length() < 2) { 229 throwError(subtag, "Invalid language subtag"); 230 } 231 try { // The try block is to catch the out-of-tokens case. Easier than checking each time. 232 language = subtag; 233 subtag = getSubtag(st); // prepare for next 234 235 // check for script, 4 letters 236 if (subtag.length() == 4 && ALPHA.containsAll(subtag)) { 237 script = subtag; 238 script = script.substring(0, 1).toUpperCase(Locale.ROOT) 239 + script.substring(1); 240 subtag = getSubtag(st); // prepare for next 241 } 242 243 // check for region, 2 letters or 3 digits 244 if (subtag.length() == 2 && ALPHA.containsAll(subtag) 245 || subtag.length() == 3 && DIGIT.containsAll(subtag)) { 246 region = subtag.toUpperCase(Locale.ENGLISH); 247 subtag = getSubtag(st); // prepare for next 248 } 249 250 // get variants: length > 4 or len=4 & starts with digit 251 while (isValidVariant(subtag)) { 252 variants.add(subtag); 253 subtag = getSubtag(st); // prepare for next 254 } 255 256 // get extensions: singleton '-' subtag (2-8 long) 257 while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) { 258 subtag = getExtension(subtag, st, 2); 259 if (subtag == null) return this; // done 260 } 261 262 if (subtag.equalsIgnoreCase("x")) { 263 getExtension(subtag, st, 1); 264 return this; 265 } 266 267 // if we make it to this point, then we have an error 268 throwError(subtag, "Illegal subtag"); 269 270 } catch (NoSuchElementException e) { 271 // this exception just means we ran out of tokens. That's ok, so we just return. 272 } 273 return this; 274 } 275 isValidVariant(String subtag)276 private boolean isValidVariant(String subtag) { 277 return subtag != null && ALPHANUM.containsAll(subtag) 278 && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0))); 279 } 280 281 /** 282 * 283 * @return true iff the language tag validates 284 */ isValid()285 public boolean isValid() { 286 if (grandfathered) return true; // don't need further checking, since we already did so when parsing 287 if (!validates(language, "language")) return false; 288 if (!validates(script, "script")) return false; 289 if (!validates(region, "territory")) return false; 290 for (Iterator<String> it = variants.iterator(); it.hasNext();) { 291 if (!validates(it.next(), "variant")) return false; 292 } 293 return true; // passed the gauntlet 294 } 295 296 public enum Status { 297 WELL_FORMED, VALID, CANONICAL, MINIMAL 298 } 299 getStatus(Set<String> errors)300 public Status getStatus(Set<String> errors) { 301 errors.clear(); 302 if (!isValid()) { 303 return Status.WELL_FORMED; 304 // TODO, check the bcp47 extension codes also 305 } 306 Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo(); 307 Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language"); 308 309 if (aliasInfo.get("language").containsKey(language)) { 310 errors.add("Non-canonical language: " + language); 311 } 312 Map<String, String> lstrInfo = languageInfo.get(language); 313 if (lstrInfo != null) { 314 String scope = lstrInfo.get("Scope"); 315 if ("collection".equals(scope)) { 316 errors.add("Collection language: " + language); 317 } 318 } 319 if (aliasInfo.get("script").containsKey(script)) { 320 errors.add("Non-canonical script: " + script); 321 } 322 if (aliasInfo.get("territory").containsKey(region)) { 323 errors.add("Non-canonical region: " + region); 324 } 325 if (!errors.isEmpty()) { 326 return Status.VALID; 327 } 328 String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region); 329 String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false); 330 if (minimized == null) { 331 errors.add("No minimal data for:" + tag); 332 if (script.isEmpty() && region.isEmpty()) { 333 return Status.MINIMAL; 334 } else { 335 return Status.CANONICAL; 336 } 337 } 338 if (!tag.equals(minimized)) { 339 errors.add("Not minimal:" + tag + "-->" + minimized); 340 return Status.CANONICAL; 341 } 342 return Status.MINIMAL; 343 } 344 345 /** 346 * @param subtag 347 * @param type 348 * @return true if the subtag is empty, or if it is in the registry 349 */ validates(String subtag, String type)350 private boolean validates(String subtag, String type) { 351 return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag); 352 } 353 354 /** 355 * Internal method 356 * 357 * @param minLength 358 * TODO 359 */ getExtension(String subtag, StringTokenizer st, int minLength)360 private String getExtension(String subtag, StringTokenizer st, int minLength) { 361 final String key = subtag; 362 if (extensions.containsKey(key)) { 363 throwError(subtag, "Can't have two extensions with the same key"); 364 } 365 if (!st.hasMoreElements()) { 366 throwError(subtag, "Private Use / Extension requires subsequent subtag"); 367 } 368 ImmutableList.Builder<String> result = ImmutableList.builder(); 369 try { 370 while (st.hasMoreElements()) { 371 subtag = getSubtag(st); 372 if (subtag.length() < minLength) { 373 return subtag; 374 } 375 result.add(subtag); 376 } 377 return null; 378 } finally { 379 extensions.put(key, result.build()); 380 } 381 } 382 383 /** 384 * Internal method 385 */ getSubtag(StringTokenizer st)386 private String getSubtag(StringTokenizer st) { 387 String result = st.nextToken(); 388 if (result.length() < 1 || result.length() > 8) { 389 throwError(result, "Illegal length (must be 1..8)"); 390 } 391 if (!ALPHANUM.containsAll(result)) { 392 throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")"); 393 } 394 return result; 395 } 396 397 /** 398 * Internal method 399 */ throwError(String subtag, String errorText)400 private void throwError(String subtag, String errorText) { 401 throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original); 402 } 403 setRegion(String region)404 public LanguageTagParser setRegion(String region) { 405 this.region = region; 406 return this; 407 } 408 setScript(String script)409 public LanguageTagParser setScript(String script) { 410 this.script = script; 411 return this; 412 } 413 414 public enum OutputOption { 415 ICU('_'), BCP47('-'); 416 final char separator; 417 final Joiner joiner; 418 OutputOption(char separator)419 private OutputOption(char separator) { 420 this.separator = separator; 421 joiner = Joiner.on(separator); 422 } 423 convert(Map<String, List<String>> mapToList)424 public Map<String, String> convert(Map<String, List<String>> mapToList) { 425 if (mapToList.isEmpty()) { 426 return Collections.emptyMap(); 427 } 428 ImmutableMap.Builder<String, String> builder = ImmutableMap.builder(); 429 for (Entry<String, List<String>> entry : mapToList.entrySet()) { 430 builder.put(entry.getKey(), joiner.join(entry.getValue())); 431 } 432 return builder.build(); 433 } 434 } 435 toString()436 public String toString() { 437 return toString(OutputOption.ICU); 438 } 439 toString(OutputOption oo)440 public String toString(OutputOption oo) { 441 StringBuilder result = new StringBuilder(language); // optimize for the simple cases 442 if (this.script.length() != 0) result.append(oo.separator).append(script); 443 if (this.region.length() != 0) result.append(oo.separator).append(region); 444 if (this.variants.size() != 0) { 445 for (String variant : variants) { 446 result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT)); 447 } 448 } 449 if (this.extensions.size() != 0) { 450 for (Entry<String, List<String>> extension : extensions.entrySet()) { 451 String key = extension.getKey(); 452 String value = oo.joiner.join(extension.getValue()); 453 result.append(oo.separator).append(key) 454 .append(oo.separator).append(value); 455 } 456 } 457 if (this.localeExtensions.size() != 0) { 458 if (oo == OutputOption.BCP47) { 459 throw new IllegalArgumentException("Cannot represent as BCP47 without canonicalizing first"); 460 } 461 result.append('@'); 462 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) { 463 String key = extension.getKey(); 464 String value = oo.joiner.join(extension.getValue()); 465 result.append(oo != OutputOption.ICU ? key : key.toUpperCase(Locale.ROOT)) 466 .append('=').append(oo != OutputOption.ICU ? value : value.toUpperCase(Locale.ROOT)); 467 } 468 } 469 return result.toString(); 470 } 471 472 /** 473 * Return just the language, script, and region (no variants or extensions) 474 * @return 475 */ toLSR()476 public String toLSR() { 477 String result = language; // optimize for the simple cases 478 if (this.script.length() != 0) result += "_" + script; 479 if (this.region.length() != 0) result += "_" + region; 480 return result; 481 } 482 483 public enum Fields { 484 LANGUAGE, SCRIPT, REGION, VARIANTS 485 }; 486 487 public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT)); 488 public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION)); 489 public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, 490 Fields.SCRIPT, Fields.REGION)); 491 toString(Set<Fields> selection)492 public String toString(Set<Fields> selection) { 493 String result = language; 494 if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script; 495 if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region; 496 if (selection.contains(Fields.VARIANTS) && variants.size() != 0) { 497 for (String variant : (Collection<String>) variants) { 498 result += "_" + variant; 499 } 500 } 501 return result; 502 } 503 setLanguage(String language)504 public LanguageTagParser setLanguage(String language) { 505 if (SEPARATORS.containsSome(language)) { 506 String oldScript = script; 507 String oldRegion = region; 508 Set<String> oldVariants = variants; 509 set(language); 510 if (script.length() == 0) { 511 script = oldScript; 512 } 513 if (region.length() == 0) { 514 region = oldRegion; 515 } 516 if (oldVariants.size() != 0) { 517 variants = oldVariants; 518 } 519 } else { 520 this.language = language; 521 } 522 return this; 523 } 524 setLocaleExtensions(Map<String, String> localeExtensions)525 public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) { 526 this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE); 527 return this; 528 } 529 setVariants(Collection<String> newVariants)530 public LanguageTagParser setVariants(Collection<String> newVariants) { 531 for (String variant : newVariants) { 532 if (!isValidVariant(variant)) { 533 throw new IllegalArgumentException("Illegal variant: " + variant); 534 } 535 } 536 variants.clear(); 537 variants.addAll(newVariants); 538 return this; 539 } 540 541 static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?"); 542 setExtensions(Map<String, String> newExtensions)543 public LanguageTagParser setExtensions(Map<String, String> newExtensions) { 544 this.extensions = expandMap(newExtensions, 2, 8); 545 return this; 546 } 547 getSimpleParent(String s)548 public static String getSimpleParent(String s) { 549 int lastBar = s.lastIndexOf('_'); 550 return lastBar >= 0 ? s.substring(0, lastBar) : ""; 551 } 552 expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength)553 private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) { 554 if (newLocaleExtensions.isEmpty()) { 555 return Collections.emptyMap(); 556 } 557 ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder(); 558 for (Entry<String, String> entry : newLocaleExtensions.entrySet()) { 559 result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength)); 560 } 561 return result.build(); 562 } 563 split(String value, int minLength, int maxLength)564 private List<String> split(String value, int minLength, int maxLength) { 565 List<String> values = SPLIT_BAR.splitToList(value); 566 for (String s : values) { 567 if (s.length() < minLength || s.length() > maxLength) { 568 throw new IllegalArgumentException("Illegal subtag length for: " + s); 569 } 570 if (!ALPHANUM.contains(s)) { 571 throw new IllegalArgumentException("Illegal locale character in: " + s); 572 } 573 } 574 return values; 575 } 576 }