/*
 **********************************************************************
 * Copyright (c) 2002-2011, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Mark Davis
 **********************************************************************
 */
package org.unicode.cldr.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.unicode.cldr.tool.LikelySubtags;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.text.UnicodeSet;

/**
 * Parses a language tag / locale identifier into language, script, region, variants and
 * extensions, and formats those parts back into ICU-style or BCP 47-style strings.
 *
 * <p>An instance is mutable and is meant to be reused: each call to {@link #set(String)} resets
 * all parsed state. Parsing checks well-formedness only; use {@link #isValid()} or
 * {@link #getStatus(Set)} to check subtags against the registry data.
 */
public class LanguageTagParser {

    private static final Joiner HYPHEN_JOINER = Joiner.on('-');

    /** Joins the {@code key=value} keywords that follow '@' in the ICU-style output. */
    private static final Joiner SEMICOLON_JOINER = Joiner.on(';');

    /**
     * Ordering for the keys of {@link #localeExtensions}: "t" first, then two-character keys whose
     * second character is a digit (the -t- fields), then other one-character keys (e.g. "u"), then
     * the remaining two-character keys (the -u- keys). Keys of any other length are rejected.
     */
    private static final Comparator<? super String> EXTENSION_ORDER = new Comparator<String>() {

        @Override
        public int compare(String o1, String o2) {
            int diff = Integer.compare(getBucket(o1), getBucket(o2));
            if (diff != 0) {
                return diff;
            }
            return o1.compareTo(o2);
        }

        private int getBucket(String o1) {
            switch (o1.length()) {
            case 1:
                return o1.charAt(0) == 't' ? 0 : 2;
            case 2:
                return o1.charAt(1) <= '9' ? 1 : 3;
            default:
                throw new IllegalArgumentException("Invalid locale extension key: " + o1);
            }
        }
    };

    /**
     * @return Returns the language, or "" if none.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @return Returns the script, or "" if none.
     */
    public String getScript() {
        return script;
    }

    /**
     * @return Returns the region, or "" if none.
     */
    public String getRegion() {
        return region;
    }

    /**
     * @return Returns an immutable copy of the variants.
     */
    public List<String> getVariants() {
        return ImmutableList.copyOf(variants);
    }

    /**
     * @return True if the language tag is marked as “Type: grandfathered” in BCP 47.
     */
    public boolean isLegacy() {
        return legacy;
    }

    /**
     * @return Returns the extensions, with each value list joined by '_'.
     * @deprecated use {@link #getExtensionsDetailed()}
     */
    @Deprecated
    public Map<String, String> getExtensions() {
        return OutputOption.ICU.convert(extensions);
    }

    /**
     * @return Returns the localeExtensions, with each value list joined by '_'.
     * @deprecated use {@link #getLocaleExtensionsDetailed()}
     */
    @Deprecated
    public Map<String, String> getLocaleExtensions() {
        return OutputOption.ICU.convert(localeExtensions);
    }

    /**
     * @return Returns an immutable copy of the extensions.
     */
    public Map<String, List<String>> getExtensionsDetailed() {
        return ImmutableMap.copyOf(extensions);
    }

    /**
     * @return Returns an immutable copy of the localeExtensions.
     */
    public Map<String, List<String>> getLocaleExtensionsDetailed() {
        return ImmutableMap.copyOf(localeExtensions);
    }

    /**
     * @return Returns the original, preparsed language tag (as normalized by {@link #set(String)})
     */
    public String getOriginal() {
        return original;
    }

    /**
     * @return Returns the language-script (or language) part of a tag.
     */
    public String getLanguageScript() {
        if (script.length() != 0) {
            return language + "_" + script;
        }
        return language;
    }

    /**
     * @param in
     *            Collection of language tag strings
     * @return Returns each of the language-script tags in the collection, in a new sorted set.
     */
    public static Set<String> getLanguageScript(Collection<String> in) {
        return getLanguageAndScript(in, null);
    }

    /**
     * @param in
     *            Collection of language tag strings
     * @param output
     *            set to add the results to; if null, a new sorted set is created
     * @return Returns each of the language-script tags in the collection.
     */
    public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
        if (output == null) {
            output = new TreeSet<>();
        }
        LanguageTagParser lparser = new LanguageTagParser();
        for (String tag : in) {
            output.add(lparser.set(tag).getLanguageScript());
        }
        return output;
    }

    // private fields

    private String original;
    private boolean legacy = false;
    // Initialized to "" so that the getters and toString methods honor their "or "" if none"
    // contract even before the first call to set().
    private String language = "";
    private String script = "";
    private String region = "";
    private Set<String> variants = new TreeSet<>();
    private Map<String, List<String>> extensions = new TreeMap<>(); // use tree map
    private Map<String, List<String>> localeExtensions = new TreeMap<>(EXTENSION_ORDER);

    private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
    private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
    private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
    private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
    private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
    private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
    private static final StandardCodes standardCodes = StandardCodes.make();
    private static final Set<String> legacyCodes = standardCodes.getAvailableCodes("legacy");
    private static final String separator = "-_"; // '-' alone for 3066bis language tags
    private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
    private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
    private static final Splitter SPLIT_COLON = Splitter.on(';');
    private static final Splitter SPLIT_EQUAL = Splitter.on('=');
    private static SupplementalDataInfo SDI = null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance

    /**
     * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
     * If a private-use field is found, it is returned as the last extension.<br>
     * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
     * isValid.
     *
     * <p>The empty string and "root" are treated as "und"; a tag starting with a separator gets
     * "und" prepended. The tag is lowercased before parsing. ICU-style keywords after '@'
     * ({@code key=value;key=value}) are stored as extensions (one-character keys) or locale
     * extensions (two-character keys).
     *
     * @param languageTag
     *            the tag to parse
     * @return this parser
     * @throws IllegalArgumentException
     *             if the tag is not well-formed
     */
    public LanguageTagParser set(String languageTag) {
        if (languageTag.length() == 0 || languageTag.equals("root")) {
            // With ICU 64 the language tag for root is normalized to empty string so we
            // cannot throw for empty string. However, code here and in clients
            // assumes a non-empty language tag, so for now just map "" or "root" to "und".
            languageTag = "und";
        } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) {
            languageTag = "und" + languageTag;
        }
        languageTag = languageTag.toLowerCase(Locale.ROOT);

        // clear everything out
        language = region = script = "";
        legacy = false;
        variants.clear();
        // Install fresh maps rather than clearing in place: setExtensions/setLocaleExtensions
        // store immutable maps, on which clear() and put() would fail.
        extensions = new TreeMap<>();
        localeExtensions = new TreeMap<>(EXTENSION_ORDER);
        original = languageTag;

        int atPosition = languageTag.indexOf('@');
        if (atPosition >= 0) {
            parseKeywords(languageTag.substring(atPosition + 1));
            languageTag = languageTag.substring(0, atPosition);
        }

        if (legacyCodes.contains(languageTag)) {
            language = languageTag;
            legacy = true;
            return this;
        }

        // each time we fetch a token, we check for length from 1..8, and all alphanum
        StringTokenizer st = new StringTokenizer(languageTag, separator);
        String subtag;
        try {
            subtag = getSubtag(st);
        } catch (RuntimeException e1) {
            throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
        }

        // check for private use (x-...) and return if so
        if (subtag.equalsIgnoreCase("x")) {
            getExtension(subtag, st, 1);
            return this;
        }

        // check that language subtag is valid
        if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
            throwError(subtag, "Invalid language subtag");
        }
        try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
            language = subtag;
            subtag = getSubtag(st); // prepare for next

            // check for script, 4 letters
            if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
                script = subtag.substring(0, 1).toUpperCase(Locale.ROOT) + subtag.substring(1);
                subtag = getSubtag(st); // prepare for next
            }

            // check for region, 2 letters or 3 digits
            if (subtag.length() == 2 && ALPHA.containsAll(subtag)
                || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
                region = subtag.toUpperCase(Locale.ROOT);
                subtag = getSubtag(st); // prepare for next
            }

            // get variants: length > 4 or len=4 & starts with digit
            while (isValidVariant(subtag)) {
                variants.add(subtag);
                subtag = getSubtag(st); // prepare for next
            }

            // get extensions: singleton '-' subtag (2-8 long)
            while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
                subtag = getExtension(subtag, st, 2);
                if (subtag == null) {
                    return this; // done
                }
            }

            if (subtag.equalsIgnoreCase("x")) {
                getExtension(subtag, st, 1);
                return this;
            }

            // if we make it to this point, then we have an error
            throwError(subtag, "Illegal subtag");

        } catch (NoSuchElementException ignored) {
            // this exception just means we ran out of tokens. That's ok, so we just return.
        }
        return this;
    }

    /**
     * Parses the ICU-style keyword list that follows '@' ({@code key=value;key=value}) into
     * {@link #extensions} (one-character keys) and {@link #localeExtensions} (two-character keys).
     *
     * @throws IllegalArgumentException
     *             if an item is not exactly {@code key=value}, has illegal characters, or has a
     *             key whose length is not 1 or 2
     */
    private void parseKeywords(String keywords) {
        for (String keyValue : SPLIT_COLON.split(keywords)) {
            List<String> parts = SPLIT_EQUAL.splitToList(keyValue);
            // Checking the count up front reports a missing '=' as a syntax error instead of
            // letting a NoSuchElementException escape from the split iterator.
            if (parts.size() != 2) {
                throwError(keyValue, "Invalid key/value pair");
            }
            final String key = parts.get(0);
            final String value = parts.get(1);
            if (!ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
                throwError(keyValue, "Invalid key/value pair");
            }
            List<String> valueList = SPLIT_BAR.splitToList(value);
            switch (key.length()) {
            case 1:
                extensions.put(key, valueList);
                break;
            case 2:
                localeExtensions.put(key, valueList);
                break;
            default:
                throwError(keyValue, "Invalid key/value pair");
                break;
            }
        }
    }

    /**
     * @return true if the subtag is non-null, alphanumeric, and either longer than 4 characters
     *         or exactly 4 characters starting with a digit
     */
    private boolean isValidVariant(String subtag) {
        return subtag != null && ALPHANUM.containsAll(subtag)
            && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
    }

    /**
     * Checks language, script, region and variants against the available codes. Extensions are
     * not checked.
     *
     * @return true iff the language tag validates
     */
    public boolean isValid() {
        if (legacy) {
            return true; // don't need further checking, since we already did so when parsing
        }
        if (!validates(language, "language")
            || !validates(script, "script")
            || !validates(region, "territory")) {
            return false;
        }
        for (String variant : variants) {
            if (!validates(variant, "variant")) {
                return false;
            }
        }
        return true; // passed the gauntlet
    }

    public enum Status {
        WELL_FORMED, VALID, CANONICAL, MINIMAL
    }

    /**
     * Classifies the parsed tag, from least to most constrained: well-formed, valid, canonical,
     * minimal.
     *
     * @param errors
     *            cleared on entry, then filled with the reasons the tag did not reach a higher status
     * @return the highest status the tag reaches
     */
    public Status getStatus(Set<String> errors) {
        errors.clear();
        if (!isValid()) {
            return Status.WELL_FORMED;
            // TODO, check the bcp47 extension codes also
        }

        if (SDI == null) {
            SDI = SupplementalDataInfo.getInstance();
        }
        Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
        Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");

        if (aliasInfo.get("language").containsKey(language)) {
            errors.add("Non-canonical language: " + language);
        }
        Map<String, String> lstrInfo = languageInfo.get(language);
        if (lstrInfo != null) {
            String scope = lstrInfo.get("Scope");
            if ("collection".equals(scope)) {
                errors.add("Collection language: " + language);
            }
        }
        if (aliasInfo.get("script").containsKey(script)) {
            errors.add("Non-canonical script: " + script);
        }
        if (aliasInfo.get("territory").containsKey(region)) {
            errors.add("Non-canonical region: " + region);
        }
        if (!errors.isEmpty()) {
            return Status.VALID;
        }
        String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
        String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
        if (minimized == null) {
            errors.add("No minimal data for:" + tag);
            if (script.isEmpty() && region.isEmpty()) {
                return Status.MINIMAL;
            } else {
                return Status.CANONICAL;
            }
        }
        if (!tag.equals(minimized)) {
            errors.add("Not minimal:" + tag + "-->" + minimized);
            return Status.CANONICAL;
        }
        return Status.MINIMAL;
    }

    /**
     * @param subtag
     *            the subtag to check
     * @param type
     *            the code type to look the subtag up under
     * @return true if the subtag is empty, or if it is in the registry
     */
    private boolean validates(String subtag, String type) {
        return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
    }

    /**
     * Internal method: consumes the subtags belonging to one extension singleton and records them.
     * For 'u' and 't' the contents go to {@link #localeExtensions}, split into key/value groups at
     * each two-character key (for 't', only digit-ending keys start a group until the first
     * non-key subtag has been seen). For all other singletons the contents go to
     * {@link #extensions} under the singleton.
     *
     * @param subtag
     *            the singleton that introduced the extension
     * @param st
     *            the tokenizer positioned after the singleton
     * @param minLength
     *            minimum length of a subtag that belongs to this extension; a shorter subtag ends
     *            the extension and is returned
     * @return the subtag that ended the extension, or null if the tokens ran out
     * @throws IllegalArgumentException
     *             if the extension is a duplicate, has no following subtag, or ends up empty
     */
    private String getExtension(String subtag, StringTokenizer st, int minLength) {
        String base = subtag;
        final char extension = subtag.charAt(0);
        if (extensions.containsKey(subtag)) {
            throwError(subtag, "Can't have two extensions with the same key");
        }
        if (!st.hasMoreElements()) {
            throwError(subtag, "Private Use / Extension requires subsequent subtag");
        }
        boolean takesSubkeys = extension == 'u' || extension == 't';
        boolean firstT = extension == 't';
        boolean haveContents = false;
        List<String> result = new ArrayList<>();
        try {
            while (st.hasMoreElements()) {
                subtag = getSubtag(st);
                if (subtag.length() < minLength) {
                    return subtag;
                }
                if (takesSubkeys
                    && subtag.length() == 2
                    && (!firstT || isTKey(subtag))) { // start new key-value pair
                    if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                        localeExtensions.put(base, ImmutableList.copyOf(result));
                        haveContents = true;
                        result.clear();
                    }
                    base = subtag;
                    continue;
                }
                firstT = false;
                result.add(subtag);
            }
            return null;
        } finally {
            // Runs on every exit path (early return, normal end, or exception) so the group being
            // collected is always stored; note that it throws if the extension turned out empty.
            if (takesSubkeys) {
                if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                    localeExtensions.put(base, ImmutableList.copyOf(result));
                    haveContents = true;
                }
                if (!haveContents) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
            } else {
                if (result.isEmpty()) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
                extensions.put(base, ImmutableList.copyOf(result));
            }
        }
    }

    /**
     * Internal method: fetches the next token and checks that it is 1..8 alphanumeric characters.
     *
     * @throws NoSuchElementException
     *             if there are no more tokens
     * @throws IllegalArgumentException
     *             if the token has an illegal length or illegal characters
     */
    private String getSubtag(StringTokenizer st) {
        String result = st.nextToken();
        if (result.length() < 1 || result.length() > 8) {
            throwError(result, "Illegal length (must be 1..8)");
        }
        if (!ALPHANUM.containsAll(result)) {
            throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
        }
        return result;
    }

    /**
     * Internal method: always throws an IllegalArgumentException naming the subtag and the
     * original tag.
     */
    private void throwError(String subtag, String errorText) {
        throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
    }

    /** Sets the region without any checking. */
    public LanguageTagParser setRegion(String region) {
        this.region = region;
        return this;
    }

    /** Sets the script without any checking. */
    public LanguageTagParser setScript(String script) {
        this.script = script;
        return this;
    }

    public enum OutputOption {
        ICU('_'), ICU_LCVARIANT('_'), BCP47('-');

        final char separator;
        final Joiner joiner;

        private OutputOption(char separator) {
            this.separator = separator;
            joiner = Joiner.on(separator);
        }

        /**
         * @return an immutable map with the same keys, each value list joined with this option's
         *         separator
         */
        public Map<String, String> convert(Map<String, List<String>> mapToList) {
            if (mapToList.isEmpty()) {
                return Collections.emptyMap();
            }
            ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
            for (Entry<String, List<String>> entry : mapToList.entrySet()) {
                builder.put(entry.getKey(), joiner.join(entry.getValue()));
            }
            return builder.build();
        }
    }

    @Override
    public String toString() {
        return toString(OutputOption.ICU);
    }

    /**
     * Formats the parsed tag.
     *
     * <p>For {@link OutputOption#BCP47} the extensions are written as subtags in the order:
     * singletons before 'v', then -t-, then -u-, then 'v'/'w'/'y'/'z', then 'x'. For the ICU
     * options they are written in the same order as {@code @key=value;key=value} keywords, with
     * locale-extension keys and values uppercased; variants are uppercased for
     * {@link OutputOption#ICU} only.
     */
    public String toString(OutputOption oo) {
        StringBuilder result = new StringBuilder(language); // optimize for the simple cases
        if (this.script.length() != 0) {
            result.append(oo.separator).append(script);
        }
        if (this.region.length() != 0) {
            result.append(oo.separator).append(region);
        }
        for (String variant : variants) {
            result.append(oo.separator)
                .append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
        }
        if (oo == OutputOption.BCP47) {
            appendBcp47Extensions(result, oo);
        } else {
            appendIcuKeywords(result, oo);
        }
        return result.toString();
    }

    /** Where an entry of {@link #extensions} is placed relative to the -t-/-u- extensions. */
    private enum ExtensionSlot {
        BEFORE_U, AFTER_U, PRIVATE_USE
    }

    /**
     * Picks the output slot for an extension key: 'v', 'w', 'y', 'z' go after -u-, 'x' goes last,
     * and any other key stays in the slot of the previously processed key.
     */
    private static ExtensionSlot slotFor(String key, ExtensionSlot current) {
        switch (key) {
        case "v":
        case "w":
        case "y":
        case "z":
            return ExtensionSlot.AFTER_U;
        case "x":
            return ExtensionSlot.PRIVATE_USE;
        default:
            // no change; the slot is already right for earlier items.
            return current;
        }
    }

    /** Appends all extensions as BCP 47 subtags: before-u singletons, -t-, -u-, after-u, then x. */
    private void appendBcp47Extensions(StringBuilder result, OutputOption oo) {
        StringBuilder afterU = new StringBuilder();
        StringBuilder privateUse = new StringBuilder();
        ExtensionSlot slot = ExtensionSlot.BEFORE_U;
        for (Entry<String, List<String>> extension : extensions.entrySet()) {
            String key = extension.getKey();
            slot = slotFor(key, slot);
            StringBuilder target = slot == ExtensionSlot.AFTER_U ? afterU
                : slot == ExtensionSlot.PRIVATE_USE ? privateUse
                    : result;
            target.append(oo.separator).append(key)
                .append(oo.separator).append(oo.joiner.join(extension.getValue()));
        }
        appendBcp47LocaleExtensions(result, oo);
        // do extensions after u, with x last
        result.append(afterU).append(privateUse);
    }

    /**
     * Appends the -t- and -u- extensions. A singleton is written only if it has something to
     * carry: the "t"/"u" entry itself, or at least one two-character key belonging to it (digit
     * second character for -t-, otherwise -u-).
     */
    private void appendBcp47LocaleExtensions(StringBuilder result, OutputOption oo) {
        if (localeExtensions.isEmpty()) {
            return;
        }
        StringBuilder tFields = new StringBuilder();
        StringBuilder uKeys = new StringBuilder();
        for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
            String key = extension.getKey();
            if (key.length() != 2) {
                continue; // "t" and "u" themselves are handled below
            }
            StringBuilder target = isTKey(key) ? tFields : uKeys;
            target.append(oo.separator).append(key)
                .append(oo.separator).append(oo.joiner.join(extension.getValue()));
        }
        appendSingleton(result, oo, 't', localeExtensions.get("t"), tFields);
        appendSingleton(result, oo, 'u', localeExtensions.get("u"), uKeys);
    }

    /** Appends one singleton with its leading values and key/value groups, if there are any. */
    private static void appendSingleton(StringBuilder result, OutputOption oo, char singleton,
        List<String> leadingValues, StringBuilder keyedValues) {
        if (leadingValues == null && keyedValues.length() == 0) {
            return;
        }
        result.append(oo.separator).append(singleton);
        if (leadingValues != null) {
            result.append(oo.separator).append(oo.joiner.join(leadingValues));
        }
        result.append(keyedValues);
    }

    /**
     * Appends all extensions as ICU keywords: a single '@' followed by ';'-separated
     * {@code key=value} items, in the order before-u extensions, locale extensions (uppercased),
     * after-u extensions, then x.
     */
    private void appendIcuKeywords(StringBuilder result, OutputOption oo) {
        List<String> beforeU = new ArrayList<>();
        List<String> afterU = new ArrayList<>();
        List<String> privateUse = new ArrayList<>();
        ExtensionSlot slot = ExtensionSlot.BEFORE_U;
        for (Entry<String, List<String>> extension : extensions.entrySet()) {
            String key = extension.getKey();
            slot = slotFor(key, slot);
            List<String> target = slot == ExtensionSlot.AFTER_U ? afterU
                : slot == ExtensionSlot.PRIVATE_USE ? privateUse
                    : beforeU;
            target.add(key + "=" + oo.joiner.join(extension.getValue()));
        }
        List<String> keywords = new ArrayList<>(beforeU);
        for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
            String key = extension.getKey();
            String value = oo.joiner.join(extension.getValue());
            keywords.add(key.toUpperCase(Locale.ROOT) + "=" + value.toUpperCase(Locale.ROOT));
        }
        keywords.addAll(afterU);
        keywords.addAll(privateUse);
        if (!keywords.isEmpty()) {
            result.append('@').append(SEMICOLON_JOINER.join(keywords));
        }
    }

    /**
     * @return true if the key is two characters long and its second character sorts before 'a'
     *         (i.e. is a digit for alphanumeric keys), which marks a -t- field key
     */
    public static boolean isTKey(String key) {
        return key.length() == 2 && key.charAt(1) < 'a';
    }

    /**
     * @return true if the locale extensions contain "t" or any -t- field key
     */
    public boolean hasT() {
        for (String key : localeExtensions.keySet()) {
            if (key.equals("t") || isTKey(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return just the language, script, and region (no variants or extensions)
     *
     * @return the parts joined with '_'
     */
    public String toLSR() {
        String result = language; // optimize for the simple cases
        if (this.script.length() != 0) {
            result += "_" + script;
        }
        if (this.region.length() != 0) {
            result += "_" + region;
        }
        return result;
    }

    public enum Fields {
        LANGUAGE, SCRIPT, REGION, VARIANTS
    }

    public static final Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
    public static final Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
    public static final Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
        Fields.SCRIPT, Fields.REGION));

    /**
     * Formats the selected fields joined with '_'. The language is always included; the other
     * fields are included only if selected and non-empty.
     */
    public String toString(Set<Fields> selection) {
        StringBuilder result = new StringBuilder(language);
        if (selection.contains(Fields.SCRIPT) && script.length() != 0) {
            result.append('_').append(script);
        }
        if (selection.contains(Fields.REGION) && region.length() != 0) {
            result.append('_').append(region);
        }
        if (selection.contains(Fields.VARIANTS)) {
            for (String variant : variants) {
                result.append('_').append(variant);
            }
        }
        return result.toString();
    }

    /**
     * Sets the language. If the argument contains a separator it is parsed as a full tag with
     * {@link #set(String)} (which also resets the extensions); the previous script, region and
     * variants are then kept wherever the new tag does not supply its own.
     */
    public LanguageTagParser setLanguage(String language) {
        if (SEPARATORS.containsSome(language)) {
            String oldScript = script;
            String oldRegion = region;
            // Take a snapshot: set() clears the 'variants' set in place, so holding only a
            // reference to it would lose the previous contents.
            Set<String> oldVariants = new TreeSet<>(variants);
            set(language);
            if (script.length() == 0) {
                script = oldScript;
            }
            if (region.length() == 0) {
                region = oldRegion;
            }
            if (variants.isEmpty() && !oldVariants.isEmpty()) {
                variants = oldVariants;
            }
        } else {
            this.language = language;
        }
        return this;
    }

    /**
     * Replaces the locale extensions. Each value is split on '-' or '_' into subtags, which must
     * be non-empty and alphanumeric.
     *
     * @throws IllegalArgumentException
     *             if a value contains an illegal subtag
     */
    public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
        this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
        return this;
    }

    /**
     * Replaces the variants.
     *
     * @throws IllegalArgumentException
     *             if any variant is not well-formed; in that case the current variants are kept
     */
    public LanguageTagParser setVariants(Collection<String> newVariants) {
        for (String variant : newVariants) {
            if (!isValidVariant(variant)) {
                throw new IllegalArgumentException("Illegal variant: " + variant);
            }
        }
        variants.clear();
        variants.addAll(newVariants);
        return this;
    }

    static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");

    /**
     * Replaces the extensions. Each value is split on '-' or '_' into subtags, which must be
     * 2..8 alphanumeric characters.
     *
     * @throws IllegalArgumentException
     *             if a value contains an illegal subtag
     */
    public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
        this.extensions = expandMap(newExtensions, 2, 8);
        return this;
    }

    /**
     * @return the part of s before its last '_', or "" if it has none
     */
    public static String getSimpleParent(String s) {
        int lastBar = s.lastIndexOf('_');
        return lastBar >= 0 ? s.substring(0, lastBar) : "";
    }

    /**
     * Converts a key → joined-value map into an immutable key → subtag-list map, checking each
     * subtag with {@link #split(String, int, int)}.
     */
    private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
        if (newLocaleExtensions.isEmpty()) {
            return Collections.emptyMap();
        }
        ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
        for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
            result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
        }
        return result.build();
    }

    /**
     * Splits a value on '-' or '_' and checks every piece.
     *
     * @throws IllegalArgumentException
     *             if a piece is shorter than minLength, longer than maxLength, or not alphanumeric
     */
    private List<String> split(String value, int minLength, int maxLength) {
        List<String> values = SPLIT_BAR.splitToList(value);
        for (String s : values) {
            if (s.length() < minLength || s.length() > maxLength) {
                throw new IllegalArgumentException("Illegal subtag length for: " + s);
            }
            if (!ALPHANUM.containsAll(s)) {
                throw new IllegalArgumentException("Illegal locale character in: " + s);
            }
        }
        return values;
    }

    public enum Format {
        icu("_", "_"), bcp47("-", "-"), structure("; ", "=");

        public final String separator;
        public final String separator2;

        private Format(String separator, String separator2) {
            this.separator = separator;
            this.separator2 = separator2;
        }
    }

    /**
     * Formats the parsed tag field by field using the separators of the given format. For
     * {@link Format#structure} the result is wrapped in square brackets and the language, script,
     * region and variants are labelled with their field names.
     */
    public String toString(Format format) {
        StringBuilder result = new StringBuilder();
        if (format == Format.structure) {
            result.append("[");
        }
        appendField(format, result, "language", language);
        appendField(format, result, "script", script);
        appendField(format, result, "region", region);
        appendField(format, result, "variants", variants);
        appendField(format, result, "extensions", extensions, new UnicodeSet('a', 's'));
        appendField(format, result, "localeX", localeExtensions, null);
        appendField(format, result, "extensions", extensions, new UnicodeSet('v', 'w', 'y', 'z'));
        appendField(format, result, "extensions", extensions, new UnicodeSet('x', 'x'));
        if (format == Format.structure) {
            result.append("]");
        }
        return result.toString();
    }

    /**
     * Appends a single-valued field if it is non-empty, preceded by the format's separator when
     * the result already holds more than one character, and labelled in the structure format.
     */
    private void appendField(Format format, StringBuilder result, String fieldName, String fieldValue) {
        if (!fieldValue.isEmpty()) {
            if (result.length() > 1) {
                result.append(format.separator);
            }
            if (format == Format.structure) {
                result.append(fieldName).append("=");
            }
            result.append(fieldValue);
        }
    }

    /** Appends separator, key, second separator, value. */
    private void appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue) {
        result.append(format.separator).append(fieldName).append(format.separator2).append(fieldValue);
    }

    /** Appends a multi-valued field as a comma-joined list, if it is non-empty. */
    private void appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) {
        if (!fieldValues.isEmpty()) {
            appendField(format, result, fieldName, Joiner.on(",").join(fieldValues));
        }
    }

    /**
     * Appends the entries of an extension map.
     *
     * <p>A null match means the map holds the -t- and -u- extensions: outside the structure
     * format they are written with the -t- part first (tlang, then digit-ending keys) and the
     * -u- part after it. Otherwise every entry whose key is in match (or every entry, if match is
     * null) is written as a key/value pair.
     */
    private void appendField(Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match) {
        if (match == null && format != Format.structure) {
            List<String> tLang = fieldValues.get("t");
            List<String> uSpecial = fieldValues.get("u");
            boolean haveTLang = tLang != null;
            boolean haveUSpecial = uSpecial != null;

            // do all the keys ending with digits first
            boolean haveT = false;
            boolean haveU = false;
            StringBuilder result2 = new StringBuilder(); // put -u- at end
            for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
                String key = entry.getKey();
                if (key.length() < 2) {
                    continue;
                }
                int lastChar = key.codePointBefore(key.length());
                if (lastChar < 'a') {
                    if (!haveT) {
                        result.append(format.separator).append('t');
                        if (haveTLang) { // empty is illegal, but just in case
                            result.append(format.separator).append(
                                Joiner.on(format.separator).join(tLang));
                            haveTLang = false;
                        }
                        haveT = true;
                    }
                    appendFieldKey(format, result, entry.getKey(),
                        Joiner.on(format.separator).join(entry.getValue()));
                } else {
                    if (!haveU) {
                        result2.append(format.separator).append('u');
                        if (haveUSpecial) { // not yet valid, but just in case
                            result2.append(format.separator).append(
                                Joiner.on(format.separator).join(uSpecial));
                            haveUSpecial = false;
                        }
                        haveU = true;
                    }
                    appendFieldKey(format, result2, entry.getKey(),
                        Joiner.on(format.separator).join(entry.getValue()));
                }
            }
            // tlang / u attributes that were not written as part of a header above
            if (haveTLang) {
                result.append(format.separator).append('t').append(format.separator).append(
                    Joiner.on(format.separator).join(tLang));
            }
            if (haveUSpecial) {
                result2.append(format.separator).append('u').append(format.separator).append(
                    Joiner.on(format.separator).join(uSpecial));
            }
            result.append(result2); // put in right order
        } else {
            for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
                if (match == null || match.contains(entry.getKey())) {
                    appendFieldKey(format, result, entry.getKey(),
                        Joiner.on(format.separator).join(entry.getValue()));
                }
            }
        }
    }
}