1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 2003-2011, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.util.Collections; 13 import java.util.Comparator; 14 import java.util.Iterator; 15 import java.util.Map; 16 import java.util.TreeMap; 17 18 import com.ibm.icu.impl.locale.AsciiUtil; 19 20 /** 21 * Utility class to parse and normalize locale ids (including POSIX style) 22 */ 23 public final class LocaleIDParser { 24 25 /** 26 * Char array representing the locale ID. 27 */ 28 private char[] id; 29 30 /** 31 * Current position in {@link #id} (while parsing). 32 */ 33 private int index; 34 35 /** 36 * Temporary buffer for parsed sections of data. 37 */ 38 private StringBuilder buffer; 39 40 // um, don't handle POSIX ids unless we request it. why not? well... because. 41 private boolean canonicalize; 42 private boolean hadCountry; 43 44 // used when canonicalizing 45 Map<String, String> keywords; 46 String baseName; 47 48 /** 49 * Parsing constants. 50 */ 51 private static final char KEYWORD_SEPARATOR = '@'; 52 private static final char HYPHEN = '-'; 53 private static final char KEYWORD_ASSIGN = '='; 54 private static final char COMMA = ','; 55 private static final char ITEM_SEPARATOR = ';'; 56 private static final char DOT = '.'; 57 private static final char UNDERSCORE = '_'; 58 LocaleIDParser(String localeID)59 public LocaleIDParser(String localeID) { 60 this(localeID, false); 61 } 62 LocaleIDParser(String localeID, boolean canonicalize)63 public LocaleIDParser(String localeID, boolean canonicalize) { 64 id = localeID.toCharArray(); 65 index = 0; 66 buffer = new StringBuilder(id.length + 5); 67 this.canonicalize = canonicalize; 68 } 69 reset()70 private void reset() { 71 index = 0; 72 buffer = new StringBuilder(id.length + 5); 73 } 74 75 // utilities for working on text in the buffer 76 77 /** 78 * Append c to the buffer. 79 */ append(char c)80 private void append(char c) { 81 buffer.append(c); 82 } 83 addSeparator()84 private void addSeparator() { 85 append(UNDERSCORE); 86 } 87 88 /** 89 * Returns the text in the buffer from start to blen as a String. 90 */ getString(int start)91 private String getString(int start) { 92 return buffer.substring(start); 93 } 94 95 /** 96 * Set the length of the buffer to pos, then append the string. 97 */ set(int pos, String s)98 private void set(int pos, String s) { 99 buffer.delete(pos, buffer.length()); 100 buffer.insert(pos, s); 101 } 102 103 /** 104 * Append the string to the buffer. 105 */ append(String s)106 private void append(String s) { 107 buffer.append(s); 108 } 109 110 // utilities for parsing text out of the id 111 112 /** 113 * Character to indicate no more text is available in the id. 114 */ 115 private static final char DONE = '\uffff'; 116 117 /** 118 * Returns the character at index in the id, and advance index. The returned character 119 * is DONE if index was at the limit of the buffer. The index is advanced regardless 120 * so that decrementing the index will always 'unget' the last character returned. 121 */ next()122 private char next() { 123 if (index == id.length) { 124 index++; 125 return DONE; 126 } 127 128 return id[index++]; 129 } 130 131 /** 132 * Advance index until the next terminator or id separator, and leave it there. 133 */ skipUntilTerminatorOrIDSeparator()134 private void skipUntilTerminatorOrIDSeparator() { 135 while (!isTerminatorOrIDSeparator(next())); 136 --index; 137 } 138 139 /** 140 * Returns true if the character at index in the id is a terminator. 141 */ atTerminator()142 private boolean atTerminator() { 143 return index >= id.length || isTerminator(id[index]); 144 } 145 146 /** 147 * Returns true if the character is a terminator (keyword separator, dot, or DONE). 148 * Dot is a terminator because of the POSIX form, where dot precedes the codepage. 149 */ isTerminator(char c)150 private boolean isTerminator(char c) { 151 // always terminate at DOT, even if not handling POSIX. It's an error... 152 return c == KEYWORD_SEPARATOR || c == DONE || c == DOT; 153 } 154 155 /** 156 * Returns true if the character is a terminator or id separator. 157 */ isTerminatorOrIDSeparator(char c)158 private boolean isTerminatorOrIDSeparator(char c) { 159 return c == UNDERSCORE || c == HYPHEN || isTerminator(c); 160 } 161 162 /** 163 * Returns true if the start of the buffer has an experimental or private language 164 * prefix, the pattern '[ixIX][-_].' shows the syntax checked. 165 */ haveExperimentalLanguagePrefix()166 private boolean haveExperimentalLanguagePrefix() { 167 if (id.length > 2) { 168 char c = id[1]; 169 if (c == HYPHEN || c == UNDERSCORE) { 170 c = id[0]; 171 return c == 'x' || c == 'X' || c == 'i' || c == 'I'; 172 } 173 } 174 return false; 175 } 176 177 /** 178 * Returns true if a value separator occurs at or after index. 179 */ haveKeywordAssign()180 private boolean haveKeywordAssign() { 181 // assume it is safe to start from index 182 for (int i = index; i < id.length; ++i) { 183 if (id[i] == KEYWORD_ASSIGN) { 184 return true; 185 } 186 } 187 return false; 188 } 189 190 /** 191 * Advance index past language, and accumulate normalized language code in buffer. 192 * Index must be at 0 when this is called. Index is left at a terminator or id 193 * separator. Returns the start of the language code in the buffer. 194 */ parseLanguage()195 private int parseLanguage() { 196 int startLength = buffer.length(); 197 198 if (haveExperimentalLanguagePrefix()) { 199 append(AsciiUtil.toLower(id[0])); 200 append(HYPHEN); 201 index = 2; 202 } 203 204 char c; 205 while(!isTerminatorOrIDSeparator(c = next())) { 206 append(AsciiUtil.toLower(c)); 207 } 208 --index; // unget 209 210 if (buffer.length() - startLength == 3) { 211 String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0)); 212 if (lang != null) { 213 set(0, lang); 214 } 215 } 216 217 return 0; 218 } 219 220 /** 221 * Advance index past language. Index must be at 0 when this is called. Index 222 * is left at a terminator or id separator. 223 */ skipLanguage()224 private void skipLanguage() { 225 if (haveExperimentalLanguagePrefix()) { 226 index = 2; 227 } 228 skipUntilTerminatorOrIDSeparator(); 229 } 230 231 /** 232 * Advance index past script, and accumulate normalized script in buffer. 233 * Index must be immediately after the language. 234 * If the item at this position is not a script (is not four characters 235 * long) leave index and buffer unchanged. Otherwise index is left at 236 * a terminator or id separator. Returns the start of the script code 237 * in the buffer (this may be equal to the buffer length, if there is no 238 * script). 239 */ parseScript()240 private int parseScript() { 241 if (!atTerminator()) { 242 int oldIndex = index; // save original index 243 ++index; 244 245 int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone 246 char c; 247 boolean firstPass = true; 248 while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) { 249 if (firstPass) { 250 addSeparator(); 251 append(AsciiUtil.toUpper(c)); 252 firstPass = false; 253 } else { 254 append(AsciiUtil.toLower(c)); 255 } 256 } 257 --index; // unget 258 259 /* If it's not exactly 4 characters long, then it's not a script. */ 260 if (index - oldIndex != 5) { // +1 to account for separator 261 index = oldIndex; 262 buffer.delete(oldBlen, buffer.length()); 263 } else { 264 oldBlen++; // index past hyphen, for clients who want to extract just the script 265 } 266 267 return oldBlen; 268 } 269 return buffer.length(); 270 } 271 272 /** 273 * Advance index past script. 274 * Index must be immediately after the language and IDSeparator. 275 * If the item at this position is not a script (is not four characters 276 * long) leave index. Otherwise index is left at a terminator or 277 * id separator. 278 */ skipScript()279 private void skipScript() { 280 if (!atTerminator()) { 281 int oldIndex = index; 282 ++index; 283 284 char c; 285 while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)); 286 --index; 287 288 if (index - oldIndex != 5) { // +1 to account for separator 289 index = oldIndex; 290 } 291 } 292 } 293 294 /** 295 * Advance index past country, and accumulate normalized country in buffer. 296 * Index must be immediately after the script (if there is one, else language) 297 * and IDSeparator. Return the start of the country code in the buffer. 298 */ parseCountry()299 private int parseCountry() { 300 if (!atTerminator()) { 301 int oldIndex = index; 302 ++index; 303 304 int oldBlen = buffer.length(); 305 char c; 306 boolean firstPass = true; 307 while (!isTerminatorOrIDSeparator(c = next())) { 308 if (firstPass) { // first, add hyphen 309 hadCountry = true; // we have a country, let variant parsing know 310 addSeparator(); 311 ++oldBlen; // increment past hyphen 312 firstPass = false; 313 } 314 append(AsciiUtil.toUpper(c)); 315 } 316 --index; // unget 317 318 int charsAppended = buffer.length() - oldBlen; 319 320 if (charsAppended == 0) { 321 // Do nothing. 322 } 323 else if (charsAppended < 2 || charsAppended > 3) { 324 // It's not a country, so return index and blen to 325 // their previous values. 326 index = oldIndex; 327 --oldBlen; 328 buffer.delete(oldBlen, buffer.length()); 329 hadCountry = false; 330 } 331 else if (charsAppended == 3) { 332 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen)); 333 if (region != null) { 334 set(oldBlen, region); 335 } 336 } 337 338 return oldBlen; 339 } 340 341 return buffer.length(); 342 } 343 344 /** 345 * Advance index past country. 346 * Index must be immediately after the script (if there is one, else language) 347 * and IDSeparator. 348 */ skipCountry()349 private void skipCountry() { 350 if (!atTerminator()) { 351 if (id[index] == UNDERSCORE || id[index] == HYPHEN) { 352 ++index; 353 } 354 /* 355 * Save the index point after the separator, since the format 356 * requires two separators if the country is not present. 357 */ 358 int oldIndex = index; 359 360 skipUntilTerminatorOrIDSeparator(); 361 int charsSkipped = index - oldIndex; 362 if (charsSkipped < 2 || charsSkipped > 3) { 363 index = oldIndex; 364 } 365 } 366 } 367 368 // There are no strict limitation of the syntax of variant in the legacy 369 // locale format. If the locale is constructed from unicode_locale_id 370 // as defined in UTS35, then we know each unicode_variant_subtag 371 // could have max length of 8 ((alphanum{5,8} | digit alphanum{3}) 372 // 179 would allow 20 unicode_variant_subtag with sep in the 373 // unicode_locale_id 374 // 8*20 + 1*(20-1) = 179 375 private static final int MAX_VARIANTS_LENGTH = 179; 376 377 /** 378 * Advance index past variant, and accumulate normalized variant in buffer. This ignores 379 * the codepage information from POSIX ids. Index must be immediately after the country 380 * or script. Index is left at the keyword separator or at the end of the text. Return 381 * the start of the variant code in the buffer. 382 * 383 * In standard form, we can have the following forms: 384 * ll__VVVV 385 * ll_CC_VVVV 386 * ll_Ssss_VVVV 387 * ll_Ssss_CC_VVVV 388 * 389 * This also handles POSIX ids, which can have the following forms (pppp is code page id): 390 * ll_CC.pppp --> ll_CC 391 * ll_CC.pppp@VVVV --> ll_CC_VVVV 392 * ll_CC@VVVV --> ll_CC_VVVV 393 * 394 * We identify this use of '@' in POSIX ids by looking for an '=' following 395 * the '@'. If there is one, we consider '@' to start a keyword list, instead of 396 * being part of a POSIX id. 397 * 398 * Note: since it was decided that we want an option to not handle POSIX ids, this 399 * becomes a bit more complex. 400 */ parseVariant()401 private int parseVariant() { 402 int oldBlen = buffer.length(); 403 404 boolean start = true; 405 boolean needSeparator = true; 406 boolean skipping = false; 407 char c; 408 boolean firstPass = true; 409 410 while ((c = next()) != DONE) { 411 if (c == DOT) { 412 start = false; 413 skipping = true; 414 } else if (c == KEYWORD_SEPARATOR) { 415 if (haveKeywordAssign()) { 416 break; 417 } 418 skipping = false; 419 start = false; 420 needSeparator = true; // add another underscore if we have more text 421 } else if (start) { 422 start = false; 423 if (c != UNDERSCORE && c != HYPHEN) { 424 index--; 425 } 426 } else if (!skipping) { 427 if (needSeparator) { 428 needSeparator = false; 429 if (firstPass && !hadCountry) { // no country, we'll need two 430 addSeparator(); 431 ++oldBlen; // for sure 432 } 433 addSeparator(); 434 if (firstPass) { // only for the first separator 435 ++oldBlen; 436 firstPass = false; 437 } 438 } 439 c = AsciiUtil.toUpper(c); 440 if (c == HYPHEN || c == COMMA) { 441 c = UNDERSCORE; 442 } 443 append(c); 444 if (buffer.length() - oldBlen > MAX_VARIANTS_LENGTH) { 445 throw new IllegalArgumentException("variants is too long"); 446 } 447 } 448 } 449 --index; // unget 450 return oldBlen; 451 } 452 453 // no need for skipvariant, to get the keywords we'll just scan directly for 454 // the keyword separator 455 456 /** 457 * Returns the normalized language id, or the empty string. 458 */ getLanguage()459 public String getLanguage() { 460 reset(); 461 return getString(parseLanguage()); 462 } 463 464 /** 465 * Returns the normalized script id, or the empty string. 466 */ getScript()467 public String getScript() { 468 reset(); 469 skipLanguage(); 470 return getString(parseScript()); 471 } 472 473 /** 474 * return the normalized country id, or the empty string. 475 */ getCountry()476 public String getCountry() { 477 reset(); 478 skipLanguage(); 479 skipScript(); 480 return getString(parseCountry()); 481 } 482 483 /** 484 * Returns the normalized variant id, or the empty string. 485 */ getVariant()486 public String getVariant() { 487 reset(); 488 skipLanguage(); 489 skipScript(); 490 skipCountry(); 491 return getString(parseVariant()); 492 } 493 494 /** 495 * Returns the language, script, country, and variant as separate strings. 496 */ getLanguageScriptCountryVariant()497 public String[] getLanguageScriptCountryVariant() { 498 reset(); 499 return new String[] { 500 getString(parseLanguage()), 501 getString(parseScript()), 502 getString(parseCountry()), 503 getString(parseVariant()) 504 }; 505 } 506 setBaseName(String baseName)507 public void setBaseName(String baseName) { 508 this.baseName = baseName; 509 } 510 parseBaseName()511 public void parseBaseName() { 512 if (baseName != null) { 513 set(0, baseName); 514 } else { 515 reset(); 516 parseLanguage(); 517 parseScript(); 518 parseCountry(); 519 parseVariant(); 520 521 // catch unwanted trailing underscore after country if there was no variant 522 int len = buffer.length(); 523 if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) { 524 buffer.deleteCharAt(len - 1); 525 } 526 } 527 } 528 529 /** 530 * Returns the normalized base form of the locale id. The base 531 * form does not include keywords. 532 */ getBaseName()533 public String getBaseName() { 534 if (baseName != null) { 535 return baseName; 536 } 537 parseBaseName(); 538 return getString(0); 539 } 540 541 /** 542 * Returns the normalized full form of the locale id. The full 543 * form includes keywords if they are present. 544 */ getName()545 public String getName() { 546 parseBaseName(); 547 parseKeywords(); 548 return getString(0); 549 } 550 551 // keyword utilities 552 553 /** 554 * If we have keywords, advance index to the start of the keywords and return true, 555 * otherwise return false. 556 */ setToKeywordStart()557 private boolean setToKeywordStart() { 558 for (int i = index; i < id.length; ++i) { 559 if (id[i] == KEYWORD_SEPARATOR) { 560 if (canonicalize) { 561 for (int j = ++i; j < id.length; ++j) { // increment i past separator for return 562 if (id[j] == KEYWORD_ASSIGN) { 563 index = i; 564 return true; 565 } 566 } 567 } else { 568 if (++i < id.length) { 569 index = i; 570 return true; 571 } 572 } 573 break; 574 } 575 } 576 return false; 577 } 578 isDoneOrKeywordAssign(char c)579 private static boolean isDoneOrKeywordAssign(char c) { 580 return c == DONE || c == KEYWORD_ASSIGN; 581 } 582 isDoneOrItemSeparator(char c)583 private static boolean isDoneOrItemSeparator(char c) { 584 return c == DONE || c == ITEM_SEPARATOR; 585 } 586 getKeyword()587 private String getKeyword() { 588 int start = index; 589 while (!isDoneOrKeywordAssign(next())) { 590 } 591 --index; 592 return AsciiUtil.toLowerString(new String(id, start, index-start).trim()); 593 } 594 getValue()595 private String getValue() { 596 int start = index; 597 while (!isDoneOrItemSeparator(next())) { 598 } 599 --index; 600 return new String(id, start, index-start).trim(); // leave case alone 601 } 602 getKeyComparator()603 private Comparator<String> getKeyComparator() { 604 final Comparator<String> comp = new Comparator<String>() { 605 @Override 606 public int compare(String lhs, String rhs) { 607 return lhs.compareTo(rhs); 608 } 609 }; 610 return comp; 611 } 612 613 /** 614 * Returns a map of the keywords and values, or null if there are none. 615 */ getKeywordMap()616 public Map<String, String> getKeywordMap() { 617 if (keywords == null) { 618 TreeMap<String, String> m = null; 619 if (setToKeywordStart()) { 620 // trim spaces and convert to lower case, both keywords and values. 621 do { 622 String key = getKeyword(); 623 if (key.length() == 0) { 624 break; 625 } 626 char c = next(); 627 if (c != KEYWORD_ASSIGN) { 628 // throw new IllegalArgumentException("key '" + key + "' missing a value."); 629 if (c == DONE) { 630 break; 631 } else { 632 continue; 633 } 634 } 635 String value = getValue(); 636 if (value.length() == 0) { 637 // throw new IllegalArgumentException("key '" + key + "' missing a value."); 638 continue; 639 } 640 if (m == null) { 641 m = new TreeMap<String, String>(getKeyComparator()); 642 } else if (m.containsKey(key)) { 643 // throw new IllegalArgumentException("key '" + key + "' already has a value."); 644 continue; 645 } 646 m.put(key, value); 647 } while (next() == ITEM_SEPARATOR); 648 } 649 keywords = m != null ? m : Collections.<String, String>emptyMap(); 650 } 651 652 return keywords; 653 } 654 655 656 /** 657 * Parse the keywords and return start of the string in the buffer. 658 */ parseKeywords()659 private int parseKeywords() { 660 int oldBlen = buffer.length(); 661 Map<String, String> m = getKeywordMap(); 662 if (!m.isEmpty()) { 663 boolean first = true; 664 for (Map.Entry<String, String> e : m.entrySet()) { 665 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR); 666 first = false; 667 append(e.getKey()); 668 append(KEYWORD_ASSIGN); 669 append(e.getValue()); 670 } 671 if (first == false) { 672 ++oldBlen; 673 } 674 } 675 return oldBlen; 676 } 677 678 /** 679 * Returns an iterator over the keywords, or null if we have an empty map. 680 */ getKeywords()681 public Iterator<String> getKeywords() { 682 Map<String, String> m = getKeywordMap(); 683 return m.isEmpty() ? null : m.keySet().iterator(); 684 } 685 686 /** 687 * Returns the value for the named keyword, or null if the keyword is not 688 * present. 689 */ getKeywordValue(String keywordName)690 public String getKeywordValue(String keywordName) { 691 Map<String, String> m = getKeywordMap(); 692 return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim())); 693 } 694 695 /** 696 * Set the keyword value only if it is not already set to something else. 697 */ defaultKeywordValue(String keywordName, String value)698 public void defaultKeywordValue(String keywordName, String value) { 699 setKeywordValue(keywordName, value, false); 700 } 701 702 /** 703 * Set the value for the named keyword, or unset it if value is null. If 704 * keywordName itself is null, unset all keywords. If keywordName is not null, 705 * value must not be null. 706 */ setKeywordValue(String keywordName, String value)707 public void setKeywordValue(String keywordName, String value) { 708 setKeywordValue(keywordName, value, true); 709 } 710 711 /** 712 * Set the value for the named keyword, or unset it if value is null. If 713 * keywordName itself is null, unset all keywords. If keywordName is not null, 714 * value must not be null. If reset is true, ignore any previous value for 715 * the keyword, otherwise do not change the keyword (including removal of 716 * one or all keywords). 717 */ setKeywordValue(String keywordName, String value, boolean reset)718 private void setKeywordValue(String keywordName, String value, boolean reset) { 719 if (keywordName == null) { 720 if (reset) { 721 // force new map, ignore value 722 keywords = Collections.<String, String>emptyMap(); 723 } 724 } else { 725 keywordName = AsciiUtil.toLowerString(keywordName.trim()); 726 if (keywordName.length() == 0) { 727 throw new IllegalArgumentException("keyword must not be empty"); 728 } 729 if (value != null) { 730 value = value.trim(); 731 if (value.length() == 0) { 732 throw new IllegalArgumentException("value must not be empty"); 733 } 734 } 735 Map<String, String> m = getKeywordMap(); 736 if (m.isEmpty()) { // it is EMPTY_MAP 737 if (value != null) { 738 // force new map 739 keywords = new TreeMap<String, String>(getKeyComparator()); 740 keywords.put(keywordName, value.trim()); 741 } 742 } else { 743 if (reset || !m.containsKey(keywordName)) { 744 if (value != null) { 745 m.put(keywordName, value); 746 } else { 747 m.remove(keywordName); 748 if (m.isEmpty()) { 749 // force new map 750 keywords = Collections.<String, String>emptyMap(); 751 } 752 } 753 } 754 } 755 } 756 } 757 } 758