/*
 **********************************************************************
 * Copyright (c) 2002-2011, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Mark Davis
 **********************************************************************
 */
package org.unicode.cldr.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.unicode.cldr.tool.LikelySubtags;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.text.UnicodeSet;

/**
 * Parses a language tag (BCP 47 style with '-' or ICU style with '_' and an optional
 * {@code @key=value;...} keyword section) into language, script, region, variants and
 * extensions, and formats those fields back into a string.
 * <p>
 * An instance is mutable and is reused by calling {@link #set(String)} again.
 */
public class LanguageTagParser {

    private static final Joiner HYPHEN_JOINER = Joiner.on('-');

    /**
     * Ordering used for the locale-extension map: "t" first, then two-character keys whose second
     * character sorts at or below '9', then any other single-character key, then the remaining
     * two-character keys. Ties are broken by natural string order.
     * Keys whose length is neither 1 nor 2 are rejected with an IllegalArgumentException.
     */
    private static final Comparator<? super String> EXTENSION_ORDER = new Comparator<String>() {

        @Override
        public int compare(String o1, String o2) {
            int diff = getBucket(o1) - getBucket(o2);
            if (diff != 0) {
                return diff;
            }
            return o1.compareTo(o2);
        }

        private int getBucket(String o1) {
            switch (o1.length()) {
            case 1:
                return o1.charAt(0) == 't' ? 0 : 2;
            case 2:
                return o1.charAt(1) <= '9' ? 1 : 3;
            default:
                throw new IllegalArgumentException();
            }
        }
    };

    /**
     * @return Returns the language, or "" if none.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @return Returns the script, or "" if none.
     */
    public String getScript() {
        return script;
    }

    /**
     * @return Returns the region, or "" if none.
     */
    public String getRegion() {
        return region;
    }

    /**
     * @return Returns an immutable copy of the variants.
     */
    public List<String> getVariants() {
        return ImmutableList.copyOf(variants);
    }

    /**
     * @return Returns the grandfathered flag
     */
    public boolean isGrandfathered() {
        return grandfathered;
    }

    /**
     * @return Returns the extensions, with each value list joined by '_'.
     * @deprecated Use {@link #getExtensionsDetailed()}.
     */
    @Deprecated
    public Map<String, String> getExtensions() {
        return OutputOption.ICU.convert(extensions);
    }

    /**
     * @return Returns the localeExtensions, with each value list joined by '_'.
     * @deprecated Use {@link #getLocaleExtensionsDetailed()}.
     */
    @Deprecated
    public Map<String, String> getLocaleExtensions() {
        return OutputOption.ICU.convert(localeExtensions);
    }

    /**
     * @return Returns an immutable copy of the extensions (key to list of subtags).
     */
    public Map<String, List<String>> getExtensionsDetailed() {
        return ImmutableMap.copyOf(extensions);
    }

    /**
     * @return Returns an immutable copy of the localeExtensions (key to list of subtags).
     */
    public Map<String, List<String>> getLocaleExtensionsDetailed() {
        return ImmutableMap.copyOf(localeExtensions);
    }

    /**
     * @return Returns the original, preparsed language tag (as lowercased by {@link #set(String)}).
     */
    public String getOriginal() {
        return original;
    }

    /**
     * @return Returns the language-script (or language) part of a tag.
     */
    public String getLanguageScript() {
        if (script.length() != 0) {
            return language + "_" + script;
        }
        return language;
    }

    /**
     * @param in
     *            Collection of language tag strings
     * @return Returns each of the language-script tags in the collection.
     */
    public static Set<String> getLanguageScript(Collection<String> in) {
        return getLanguageAndScript(in, null);
    }

    /**
     * @param in
     *            Collection of language tag strings
     * @param output
     *            Set to add the results to; if null, a new sorted set is created
     * @return Returns each of the language-script tags in the collection.
     */
    public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
        if (output == null) {
            output = new TreeSet<String>();
        }
        LanguageTagParser lparser = new LanguageTagParser();
        for (String tag : in) {
            output.add(lparser.set(tag).getLanguageScript());
        }
        return output;
    }

    // private fields

    private String original;
    private boolean grandfathered = false;
    private String language;
    private String script;
    private String region;
    private Set<String> variants = new TreeSet<String>();
    private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map
    private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>(EXTENSION_ORDER);

    private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
    private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
    private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
    private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
    private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
    private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
    private static final StandardCodes standardCodes = StandardCodes.make();
    private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered");
    private static final String separator = "-_"; // '-' alone for 3066bis language tags
    private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
    private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
    private static final Splitter SPLIT_COLON = Splitter.on(';');
    private static final Splitter SPLIT_EQUAL = Splitter.on('=');
    private static SupplementalDataInfo SDI = null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance

    /**
     * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
     * If a private-use field is found, it is returned as the last extension.<br>
     * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
     * isValid.
     *
     * @param languageTag
     *            the tag to parse; "" and "root" are treated as "und", and a tag starting with a
     *            separator has "und" prepended
     * @return this parser, for chaining
     * @throws IllegalArgumentException
     *             if the tag is not well-formed
     */
    public LanguageTagParser set(String languageTag) {
        if (languageTag.length() == 0 || languageTag.equals("root")) {
            // throw new IllegalArgumentException("Language tag cannot be empty");
            //
            // With ICU 64 the language tag for root is normalized to empty string so we
            // cannot throw for empty string as above. However, code here and in clients
            // assumes a non-empty language tag, so for now just map "" or "root" to "und".
            languageTag = "und";
        } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) {
            languageTag = "und" + languageTag;
        }
        languageTag = languageTag.toLowerCase(Locale.ROOT);

        // clear everything out
        language = region = script = "";
        grandfathered = false;
        variants.clear();
        // Allocate fresh maps instead of calling clear(): setExtensions/setLocaleExtensions may have
        // installed immutable maps, on which clear() and put() would throw.
        extensions = new TreeMap<String, List<String>>();
        localeExtensions = new TreeMap<String, List<String>>(EXTENSION_ORDER);
        original = languageTag;
        int atPosition = languageTag.indexOf('@');
        if (atPosition >= 0) {
            // already lowercased above
            final String extensionsString = languageTag.substring(atPosition + 1);
            for (String keyValue : SPLIT_COLON.split(extensionsString)) {
                final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
                final String key = keyValuePair.next(); // a split always yields at least one piece
                if (!keyValuePair.hasNext()) {
                    // no '=' at all; report it like every other syntax error
                    throwError(keyValue, "Invalid key/value pair");
                }
                final String value = keyValuePair.next();
                if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
                    throwError(keyValue, "Invalid key/value pair");
                }
                List<String> valueList = SPLIT_BAR.splitToList(value);
                switch (key.length()) {
                case 1:
                    extensions.put(key, valueList);
                    break;
                case 2:
                    localeExtensions.put(key, valueList);
                    break;
                default:
                    throwError(keyValue, "Invalid key/value pair");
                    break;
                }
            }
            languageTag = languageTag.substring(0, atPosition);
        }

        // first test for grandfathered
        if (grandfatheredCodes.contains(languageTag)) {
            language = languageTag;
            grandfathered = true;
            return this;
        }

        // each time we fetch a token, we check for length from 1..8, and all alphanum
        StringTokenizer st = new StringTokenizer(languageTag, separator);
        String subtag;
        try {
            subtag = getSubtag(st);
        } catch (Exception e1) {
            throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
        }

        // check for private use (x-...) and return if so
        if (subtag.equalsIgnoreCase("x")) {
            getExtension(subtag, st, 1);
            return this;
        }

        // check that language subtag is valid
        if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
            throwError(subtag, "Invalid language subtag");
        }
        try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
            language = subtag;
            subtag = getSubtag(st); // prepare for next

            // check for script, 4 letters
            if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
                script = subtag;
                script = script.substring(0, 1).toUpperCase(Locale.ROOT)
                    + script.substring(1);
                subtag = getSubtag(st); // prepare for next
            }

            // check for region, 2 letters or 3 digits
            if (subtag.length() == 2 && ALPHA.containsAll(subtag)
                || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
                region = subtag.toUpperCase(Locale.ENGLISH);
                subtag = getSubtag(st); // prepare for next
            }

            // get variants: length > 4 or len=4 & starts with digit
            while (isValidVariant(subtag)) {
                variants.add(subtag);
                subtag = getSubtag(st); // prepare for next
            }

            // get extensions: singleton '-' subtag (2-8 long)
            while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
                subtag = getExtension(subtag, st, 2);
                if (subtag == null) {
                    return this; // done
                }
            }

            if (subtag.equalsIgnoreCase("x")) {
                getExtension(subtag, st, 1);
                return this;
            }

            // if we make it to this point, then we have an error
            throwError(subtag, "Illegal subtag");

        } catch (NoSuchElementException e) {
            // this exception just means we ran out of tokens. That's ok, so we just return.
        }
        return this;
    }

    /**
     * @return true if the subtag is non-null, alphanumeric, and either longer than 4 characters or
     *         exactly 4 characters starting with a digit
     */
    private boolean isValidVariant(String subtag) {
        return subtag != null && ALPHANUM.containsAll(subtag)
            && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
    }

    /**
     *
     * @return true iff the language tag validates
     */
    public boolean isValid() {
        if (grandfathered) {
            return true; // don't need further checking, since we already did so when parsing
        }
        if (!validates(language, "language")) {
            return false;
        }
        if (!validates(script, "script")) {
            return false;
        }
        if (!validates(region, "territory")) {
            return false;
        }
        for (String variant : variants) {
            if (!validates(variant, "variant")) {
                return false;
            }
        }
        return true; // passed the gauntlet
    }

    public enum Status {
        WELL_FORMED, VALID, CANONICAL, MINIMAL
    }

    /**
     * Classifies the currently parsed tag.
     *
     * @param errors
     *            cleared on entry, then filled with a message for each reason the tag did not reach
     *            a higher status
     * @return WELL_FORMED if {@link #isValid()} fails; VALID if the language, script or region has an
     *         alias entry or the language has scope "collection"; otherwise CANONICAL or MINIMAL
     *         depending on the result of likely-subtag minimization
     */
    public Status getStatus(Set<String> errors) {
        errors.clear();
        if (!isValid()) {
            return Status.WELL_FORMED;
            // TODO, check the bcp47 extension codes also
        }

        if (SDI == null) {
            SDI = SupplementalDataInfo.getInstance();
        }
        Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
        Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");

        if (aliasInfo.get("language").containsKey(language)) {
            errors.add("Non-canonical language: " + language);
        }
        Map<String, String> lstrInfo = languageInfo.get(language);
        if (lstrInfo != null) {
            String scope = lstrInfo.get("Scope");
            if ("collection".equals(scope)) {
                errors.add("Collection language: " + language);
            }
        }
        if (aliasInfo.get("script").containsKey(script)) {
            errors.add("Non-canonical script: " + script);
        }
        if (aliasInfo.get("territory").containsKey(region)) {
            errors.add("Non-canonical region: " + region);
        }
        if (!errors.isEmpty()) {
            return Status.VALID;
        }
        String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
        String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
        if (minimized == null) {
            errors.add("No minimal data for:" + tag);
            if (script.isEmpty() && region.isEmpty()) {
                return Status.MINIMAL;
            } else {
                return Status.CANONICAL;
            }
        }
        if (!tag.equals(minimized)) {
            errors.add("Not minimal:" + tag + "-->" + minimized);
            return Status.CANONICAL;
        }
        return Status.MINIMAL;
    }

    /**
     * @param subtag
     *            the subtag to check
     * @param type
     *            the code type passed to the standard-codes lookup (e.g. "language", "script")
     * @return true if the subtag is empty, or if it is in the registry
     */
    private boolean validates(String subtag, String type) {
        return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
    }

    /**
     * Internal method: consumes the subtags belonging to one extension singleton and records them.
     * For 't' and 'u', two-character subtags start new key/value entries in localeExtensions;
     * for any other singleton the subtags are stored under that singleton in extensions.
     *
     * @param subtag
     *            the singleton that introduced the extension
     * @param st
     *            tokenizer positioned just after the singleton
     * @param minLength
     *            a subtag shorter than this ends the extension and is handed back to the caller
     * @return the subtag that ended the extension, or null if the tokens ran out
     * @throws IllegalArgumentException
     *             if the singleton repeats, nothing follows it, or the extension ends up empty
     */
    private String getExtension(String subtag, StringTokenizer st, int minLength) {
        String base = subtag;
        final char extension = subtag.charAt(0);
        if (extensions.containsKey(subtag)) {
            throwError(subtag, "Can't have two extensions with the same key");
        }
        if (!st.hasMoreElements()) {
            throwError(subtag, "Private Use / Extension requires subsequent subtag");
        }
        boolean takesSubkeys = extension == 'u' || extension == 't';
        // while true, a two-character subtag after 't' only starts a key if it looks like a t-key
        boolean firstT = extension == 't';
        boolean haveContents = false;
        List<String> result = new ArrayList<>();
        try {
            while (st.hasMoreElements()) {
                subtag = getSubtag(st);
                if (subtag.length() < minLength) {
                    return subtag;
                }
                if (takesSubkeys
                    && subtag.length() == 2
                    && (!firstT || isTKey(subtag))) { // start new key-value pair
                    if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                        localeExtensions.put(base, ImmutableList.copyOf(result));
                        haveContents = true;
                        result.clear();
                    }
                    base = subtag;
                    continue;
                }
                firstT = false;
                result.add(subtag);
            }
            return null;
        } finally {
            // Runs on every exit path (including exceptions) to flush the pending key/value.
            if (takesSubkeys) {
                if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                    localeExtensions.put(base, ImmutableList.copyOf(result));
                    haveContents = true;
                }
                if (!haveContents) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
            } else {
                if (result.isEmpty()) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
                extensions.put(base, ImmutableList.copyOf(result));
            }
        }
    }

    /**
     * Internal method: fetches the next token and checks that it is 1..8 alphanumeric characters.
     *
     * @throws NoSuchElementException
     *             if the tokenizer has no more tokens
     * @throws IllegalArgumentException
     *             if the token has an illegal length or illegal characters
     */
    private String getSubtag(StringTokenizer st) {
        String result = st.nextToken();
        if (result.length() < 1 || result.length() > 8) {
            throwError(result, "Illegal length (must be 1..8)");
        }
        if (!ALPHANUM.containsAll(result)) {
            throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
        }
        return result;
    }

    /**
     * Internal method: always throws an IllegalArgumentException naming the subtag and the original tag.
     */
    private void throwError(String subtag, String errorText) {
        throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
    }

    /**
     * Sets the region without any validation.
     *
     * @return this parser, for chaining
     */
    public LanguageTagParser setRegion(String region) {
        this.region = region;
        return this;
    }

    /**
     * Sets the script without any validation.
     *
     * @return this parser, for chaining
     */
    public LanguageTagParser setScript(String script) {
        this.script = script;
        return this;
    }

    /**
     * Output style for {@link LanguageTagParser#toString(OutputOption)}: ICU uses '_' between
     * subtags and an {@code @key=value;...} keyword section, BCP47 uses '-' throughout.
     */
    public enum OutputOption {
        ICU('_'), BCP47('-');

        final char separator;
        final Joiner joiner;

        private OutputOption(char separator) {
            this.separator = separator;
            joiner = Joiner.on(separator);
        }

        /**
         * @return an immutable map with each value list joined by this option's separator
         */
        public Map<String, String> convert(Map<String, List<String>> mapToList) {
            if (mapToList.isEmpty()) {
                return Collections.emptyMap();
            }
            ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
            for (Entry<String, List<String>> entry : mapToList.entrySet()) {
                builder.put(entry.getKey(), joiner.join(entry.getValue()));
            }
            return builder.build();
        }
    }

    /**
     * @return the tag in ICU form; same as {@code toString(OutputOption.ICU)}
     */
    @Override
    public String toString() {
        return toString(OutputOption.ICU);
    }

    /**
     * Formats the parsed fields. The order is language, script, region, variants, single-letter
     * extensions other than v/w/x/y/z, the locale (t/u) extensions, then v/w/y/z, with x last.
     *
     * @param oo
     *            the output style
     * @return the formatted tag
     */
    public String toString(OutputOption oo) {
        StringBuilder result = new StringBuilder(language); // optimize for the simple cases
        if (!script.isEmpty()) {
            result.append(oo.separator).append(script);
        }
        if (!region.isEmpty()) {
            result.append(oo.separator).append(region);
        }
        for (String variant : variants) {
            result.append(oo.separator)
                .append(oo == OutputOption.ICU ? variant.toUpperCase(Locale.ROOT) : variant);
        }
        if (oo == OutputOption.BCP47) {
            appendBcp47Extensions(result);
        } else {
            appendIcuKeywords(result, oo);
        }
        return result.toString();
    }

    /**
     * Chooses the buffer an extension belongs in by its key, so placement does not depend on the
     * iteration order of the extensions map.
     */
    private static StringBuilder pickTarget(String key, StringBuilder main, StringBuilder afterU,
        StringBuilder privateUse) {
        switch (key) {
        case "v":
        case "w":
        case "y":
        case "z":
            return afterU;
        case "x":
            return privateUse;
        default:
            return main;
        }
    }

    /**
     * Appends "-key" followed by "-v1-v2..." to the target; the value part is omitted when there
     * are no values so that no trailing separator is produced.
     */
    private static void appendBcp47Subtags(StringBuilder target, String key, List<String> values) {
        final OutputOption oo = OutputOption.BCP47;
        target.append(oo.separator).append(key);
        if (!values.isEmpty()) {
            target.append(oo.separator).append(oo.joiner.join(values));
        }
    }

    /** Appends all extensions in BCP 47 form, with the t/u extensions between the a..s and v..z groups. */
    private void appendBcp47Extensions(StringBuilder result) {
        StringBuilder afterU = new StringBuilder();
        StringBuilder privateUse = new StringBuilder();
        for (Entry<String, List<String>> extension : extensions.entrySet()) {
            String key = extension.getKey();
            appendBcp47Subtags(pickTarget(key, result, afterU, privateUse), key, extension.getValue());
        }
        appendBcp47LocaleExtensions(result);
        // do extensions after u, with x last
        result.append(afterU).append(privateUse);
    }

    /**
     * Appends the -t- section (tlang, then t-keys) followed by the -u- section (attributes, then
     * u-keys). Each singleton is written only if its section has something in it.
     */
    private void appendBcp47LocaleExtensions(StringBuilder result) {
        final char sep = OutputOption.BCP47.separator;

        boolean haveT = false;
        List<String> tValue = localeExtensions.get("t");
        if (tValue != null) {
            appendBcp47Subtags(result, "t", tValue);
            haveT = true;
        }
        for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
            String key = extension.getKey();
            if (isTKey(key)) {
                if (!haveT) { // t-keys without a tlang still need the singleton
                    result.append(sep).append('t');
                    haveT = true;
                }
                appendBcp47Subtags(result, key, extension.getValue());
            }
        }

        boolean haveU = false;
        List<String> uValue = localeExtensions.get("u");
        if (uValue != null) {
            appendBcp47Subtags(result, "u", uValue);
            haveU = true;
        }
        for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
            String key = extension.getKey();
            if (key.length() == 2 && key.charAt(1) >= 'a') {
                if (!haveU) {
                    result.append(sep).append('u');
                    haveU = true;
                }
                appendBcp47Subtags(result, key, extension.getValue());
            }
        }
    }

    /**
     * Appends the ICU keyword section: '@' followed by ';'-separated key=value pairs. Locale
     * extension keys and values are uppercased; other extensions are written as stored.
     */
    private void appendIcuKeywords(StringBuilder result, OutputOption oo) {
        // Every keyword is written with a leading ';'; the first one is replaced by '@' at the end.
        StringBuilder keywords = new StringBuilder();
        StringBuilder afterU = new StringBuilder();
        StringBuilder privateUse = new StringBuilder();
        for (Entry<String, List<String>> extension : extensions.entrySet()) {
            String key = extension.getKey();
            pickTarget(key, keywords, afterU, privateUse)
                .append(';').append(key)
                .append('=').append(oo.joiner.join(extension.getValue()));
        }
        for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
            String value = oo.joiner.join(extension.getValue());
            keywords.append(';').append(extension.getKey().toUpperCase(Locale.ROOT))
                .append('=').append(value.toUpperCase(Locale.ROOT));
        }
        // do extensions after u, with x last
        keywords.append(afterU).append(privateUse);
        if (keywords.length() != 0) {
            result.append('@').append(keywords, 1, keywords.length());
        }
    }

    /**
     * @return true if the key has two characters and the second sorts below 'a'
     */
    public static boolean isTKey(String key) {
        return key.length() == 2 && key.charAt(1) < 'a';
    }

    /**
     * @return true if the locale extensions contain "t" or any key accepted by {@link #isTKey(String)}
     */
    public boolean hasT() {
        for (String key : localeExtensions.keySet()) {
            if (key.equals("t") || isTKey(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return just the language, script, and region (no variants or extensions)
     *
     * @return the fields joined with '_', omitting empty script and region
     */
    public String toLSR() {
        String result = language; // optimize for the simple cases
        if (this.script.length() != 0) {
            result += "_" + script;
        }
        if (this.region.length() != 0) {
            result += "_" + region;
        }
        return result;
    }

    public enum Fields {
        LANGUAGE, SCRIPT, REGION, VARIANTS
    }

    public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
    public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
    public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
        Fields.SCRIPT, Fields.REGION));

    /**
     * Formats the language plus whichever of script, region and variants are both selected and
     * non-empty, joined with '_'. The language is always included.
     */
    public String toString(Set<Fields> selection) {
        StringBuilder result = new StringBuilder(language);
        if (selection.contains(Fields.SCRIPT) && script.length() != 0) {
            result.append('_').append(script);
        }
        if (selection.contains(Fields.REGION) && region.length() != 0) {
            result.append('_').append(region);
        }
        if (selection.contains(Fields.VARIANTS)) {
            for (String variant : variants) {
                result.append('_').append(variant);
            }
        }
        return result.toString();
    }

    /**
     * Sets the language. If the argument contains a separator it is parsed as a whole tag with
     * {@link #set(String)}; the previous script, region and variants are then kept wherever the
     * new tag does not supply its own. Otherwise only the language field is replaced.
     *
     * @return this parser, for chaining
     */
    public LanguageTagParser setLanguage(String language) {
        if (SEPARATORS.containsSome(language)) {
            String oldScript = script;
            String oldRegion = region;
            // Copy: set() clears the live variants collection in place, so a plain reference
            // would be emptied along with it.
            Set<String> oldVariants = new TreeSet<String>(variants);
            set(language);
            if (script.length() == 0) {
                script = oldScript;
            }
            if (region.length() == 0) {
                region = oldRegion;
            }
            if (variants.isEmpty()) {
                variants.addAll(oldVariants);
            }
        } else {
            this.language = language;
        }
        return this;
    }

    /**
     * Replaces the locale extensions. Each value is split on '-' or '_' and every piece must be a
     * non-empty alphanumeric string.
     *
     * @return this parser, for chaining
     * @throws IllegalArgumentException
     *             if a value piece is empty or not alphanumeric
     */
    public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
        this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
        return this;
    }

    /**
     * Replaces the variants after checking each one.
     *
     * @return this parser, for chaining
     * @throws IllegalArgumentException
     *             if any variant is not well-formed; the current variants are then left untouched
     */
    public LanguageTagParser setVariants(Collection<String> newVariants) {
        for (String variant : newVariants) {
            if (!isValidVariant(variant)) {
                throw new IllegalArgumentException("Illegal variant: " + variant);
            }
        }
        variants.clear();
        variants.addAll(newVariants);
        return this;
    }

    static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");

    /**
     * Replaces the extensions. Each value is split on '-' or '_' and every piece must be 2..8
     * alphanumeric characters.
     *
     * @return this parser, for chaining
     * @throws IllegalArgumentException
     *             if a value piece has an illegal length or is not alphanumeric
     */
    public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
        this.extensions = expandMap(newExtensions, 2, 8);
        return this;
    }

    /**
     * @return the part of s before its last '_', or "" if s contains no '_'
     */
    public static String getSimpleParent(String s) {
        int lastBar = s.lastIndexOf('_');
        return lastBar >= 0 ? s.substring(0, lastBar) : "";
    }

    /**
     * Converts a key-to-joined-string map into a key-to-subtag-list map, validating each subtag.
     * The returned map is immutable and keeps the iteration order of the input.
     */
    private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
        if (newLocaleExtensions.isEmpty()) {
            return Collections.emptyMap();
        }
        ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
        for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
            result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
        }
        return result.build();
    }

    /**
     * Splits a value on '-' or '_' and checks that every piece is alphanumeric with a length in
     * minLength..maxLength.
     *
     * @throws IllegalArgumentException
     *             if any piece fails a check
     */
    private List<String> split(String value, int minLength, int maxLength) {
        List<String> values = SPLIT_BAR.splitToList(value);
        for (String s : values) {
            if (s.length() < minLength || s.length() > maxLength) {
                throw new IllegalArgumentException("Illegal subtag length for: " + s);
            }
            if (!ALPHANUM.containsAll(s)) {
                throw new IllegalArgumentException("Illegal locale character in: " + s);
            }
        }
        return values;
    }

    /**
     * Output style for {@link LanguageTagParser#toString(Format)}. {@code separator} goes between
     * fields and {@code separator2} between an extension key and its value.
     */
    public enum Format {
        icu("_", "_"), bcp47("-", "-"), structure("; ", "=");

        public final String separator;
        public final String separator2;

        private Format(String separator, String separator2) {
            this.separator = separator;
            this.separator2 = separator2;
        }
    }

    /**
     * Formats the parsed fields in the given format. The structure format wraps the result in
     * square brackets and labels each field as name=value.
     */
    public String toString(Format format) {
        StringBuilder result = new StringBuilder();
        if (format == Format.structure) {
            result.append("[");
        }
        appendField(format, result, "language", language);
        appendField(format, result, "script", script);
        appendField(format, result, "region", region);
        appendField(format, result, "variants", variants);
        appendField(format, result, "extensions", extensions, new UnicodeSet('a', 's'));
        appendField(format, result, "localeX", localeExtensions, null);
        appendField(format, result, "extensions", extensions, new UnicodeSet('v', 'w', 'y', 'z'));
        appendField(format, result, "extensions", extensions, new UnicodeSet('x', 'x'));
        if (format == Format.structure) {
            result.append("]");
        }
        return result.toString();
    }

    /**
     * Appends a non-empty simple field, preceded by the field separator unless the buffer holds at
     * most one character (i.e. nothing, or just the opening bracket of the structure format).
     */
    private void appendField(Format format, StringBuilder result, String fieldName, String fieldValue) {
        if (!fieldValue.isEmpty()) {
            if (result.length() > 1) {
                result.append(format.separator);
            }
            if (format == Format.structure) {
                result.append(fieldName).append("=");
            }
            result.append(fieldValue);
        }
    }

    /**
     * Appends separator + key, then separator2 + value. For the icu and bcp47 formats the value
     * part is skipped when the value is empty, so that no trailing separator is produced.
     */
    private void appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue) {
        result.append(format.separator).append(fieldName);
        if (format == Format.structure || !fieldValue.isEmpty()) {
            result.append(format.separator2).append(fieldValue);
        }
    }

    /** Appends a non-empty collection as a single field with its elements joined by ','. */
    private void appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) {
        if (!fieldValues.isEmpty()) {
            appendField(format, result, fieldName, CollectionUtilities.join(fieldValues, ","));
        }
    }

    /**
     * Appends extension entries. With a non-null match, only entries whose key is in the set are
     * written. A null match means it is -t- or -u-: unless the format is structure, the entries
     * are grouped under a 't' singleton (keys whose last character sorts below 'a') and a 'u'
     * singleton (all other two-or-more character keys), with the 'u' group written last.
     */
    private void appendField(Format format, StringBuilder result, String fieldName,
        Map<String, List<String>> fieldValues, UnicodeSet match) {
        if (match == null && format != Format.structure) {
            List<String> tLang = fieldValues.get("t");
            List<String> uSpecial = fieldValues.get("u");
            boolean haveTLang = tLang != null;
            boolean haveUSpecial = uSpecial != null;

            // do all the keys ending with digits first
            boolean haveT = false;
            boolean haveU = false;
            StringBuilder result2 = new StringBuilder(); // put -u- at end
            for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
                String key = entry.getKey();
                if (key.length() < 2) {
                    continue; // the bare "t" and "u" entries are handled via tLang / uSpecial
                }
                int lastChar = key.codePointBefore(key.length());
                if (lastChar < 'a') {
                    if (!haveT) {
                        result.append(format.separator).append('t');
                        if (haveTLang) { // empty is illegal, but just in case
                            result.append(format.separator).append(CollectionUtilities.join(tLang, format.separator));
                            haveTLang = false;
                        }
                        haveT = true;
                    }
                    appendFieldKey(format, result, entry.getKey(),
                        CollectionUtilities.join(entry.getValue(), format.separator));
                } else {
                    if (!haveU) {
                        result2.append(format.separator).append('u');
                        if (haveUSpecial) { // not yet valid, but just in case
                            result2.append(format.separator).append(CollectionUtilities.join(uSpecial, format.separator));
                            haveUSpecial = false;
                        }
                        haveU = true;
                    }
                    appendFieldKey(format, result2, entry.getKey(),
                        CollectionUtilities.join(entry.getValue(), format.separator));
                }
            }
            // tlang / u attributes that were not emitted because their group had no keys
            if (haveTLang) {
                result.append(format.separator).append('t').append(format.separator)
                    .append(CollectionUtilities.join(tLang, format.separator));
            }
            if (haveUSpecial) {
                result2.append(format.separator).append('u').append(format.separator)
                    .append(CollectionUtilities.join(uSpecial, format.separator));
            }
            result.append(result2); // put in right order
        } else {
            for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
                if (match == null || match.contains(entry.getKey())) {
                    appendFieldKey(format, result, entry.getKey(),
                        CollectionUtilities.join(entry.getValue(), format.separator));
                }
            }
        }
    }
}