1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 2003-2011, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.util.Collections; 13 import java.util.Comparator; 14 import java.util.Iterator; 15 import java.util.Map; 16 import java.util.TreeMap; 17 18 import com.ibm.icu.impl.locale.AsciiUtil; 19 20 /** 21 * Utility class to parse and normalize locale ids (including POSIX style) 22 */ 23 public final class LocaleIDParser { 24 25 /** 26 * Char array representing the locale ID. 27 */ 28 private char[] id; 29 30 /** 31 * Current position in {@link #id} (while parsing). 32 */ 33 private int index; 34 35 /** 36 * Temporary buffer for parsed sections of data. 37 */ 38 private StringBuilder buffer; 39 40 // um, don't handle POSIX ids unless we request it. why not? well... because. 41 private boolean canonicalize; 42 private boolean hadCountry; 43 44 // used when canonicalizing 45 Map<String, String> keywords; 46 String baseName; 47 48 /** 49 * Parsing constants. 50 */ 51 private static final char KEYWORD_SEPARATOR = '@'; 52 private static final char HYPHEN = '-'; 53 private static final char KEYWORD_ASSIGN = '='; 54 private static final char COMMA = ','; 55 private static final char ITEM_SEPARATOR = ';'; 56 private static final char DOT = '.'; 57 private static final char UNDERSCORE = '_'; 58 LocaleIDParser(String localeID)59 public LocaleIDParser(String localeID) { 60 this(localeID, false); 61 } 62 LocaleIDParser(String localeID, boolean canonicalize)63 public LocaleIDParser(String localeID, boolean canonicalize) { 64 id = localeID.toCharArray(); 65 index = 0; 66 buffer = new StringBuilder(id.length + 5); 67 this.canonicalize = canonicalize; 68 } 69 reset()70 private void reset() { 71 index = 0; 72 buffer = new StringBuilder(id.length + 5); 73 } 74 75 // utilities for working on text in the buffer 76 77 /** 78 * Append c to the buffer. 79 */ append(char c)80 private void append(char c) { 81 buffer.append(c); 82 } 83 addSeparator()84 private void addSeparator() { 85 append(UNDERSCORE); 86 } 87 88 /** 89 * Returns the text in the buffer from start to blen as a String. 90 */ getString(int start)91 private String getString(int start) { 92 return buffer.substring(start); 93 } 94 95 /** 96 * Set the length of the buffer to pos, then append the string. 97 */ set(int pos, String s)98 private void set(int pos, String s) { 99 buffer.delete(pos, buffer.length()); 100 buffer.insert(pos, s); 101 } 102 103 /** 104 * Append the string to the buffer. 105 */ append(String s)106 private void append(String s) { 107 buffer.append(s); 108 } 109 110 // utilities for parsing text out of the id 111 112 /** 113 * Character to indicate no more text is available in the id. 114 */ 115 private static final char DONE = '\uffff'; 116 117 /** 118 * Returns the character at index in the id, and advance index. The returned character 119 * is DONE if index was at the limit of the buffer. The index is advanced regardless 120 * so that decrementing the index will always 'unget' the last character returned. 121 */ next()122 private char next() { 123 if (index == id.length) { 124 index++; 125 return DONE; 126 } 127 128 return id[index++]; 129 } 130 131 /** 132 * Advance index until the next terminator or id separator, and leave it there. 133 */ skipUntilTerminatorOrIDSeparator()134 private void skipUntilTerminatorOrIDSeparator() { 135 while (!isTerminatorOrIDSeparator(next())); 136 --index; 137 } 138 139 /** 140 * Returns true if the character at index in the id is a terminator. 141 */ atTerminator()142 private boolean atTerminator() { 143 return index >= id.length || isTerminator(id[index]); 144 } 145 146 /** 147 * Returns true if the character is a terminator (keyword separator, dot, or DONE). 148 * Dot is a terminator because of the POSIX form, where dot precedes the codepage. 149 */ isTerminator(char c)150 private boolean isTerminator(char c) { 151 // always terminate at DOT, even if not handling POSIX. It's an error... 152 return c == KEYWORD_SEPARATOR || c == DONE || c == DOT; 153 } 154 155 /** 156 * Returns true if the character is a terminator or id separator. 157 */ isTerminatorOrIDSeparator(char c)158 private boolean isTerminatorOrIDSeparator(char c) { 159 return c == UNDERSCORE || c == HYPHEN || isTerminator(c); 160 } 161 162 /** 163 * Returns true if the start of the buffer has an experimental or private language 164 * prefix, the pattern '[ixIX][-_].' shows the syntax checked. 165 */ haveExperimentalLanguagePrefix()166 private boolean haveExperimentalLanguagePrefix() { 167 if (id.length > 2) { 168 char c = id[1]; 169 if (c == HYPHEN || c == UNDERSCORE) { 170 c = id[0]; 171 return c == 'x' || c == 'X' || c == 'i' || c == 'I'; 172 } 173 } 174 return false; 175 } 176 177 /** 178 * Returns true if a value separator occurs at or after index. 179 */ haveKeywordAssign()180 private boolean haveKeywordAssign() { 181 // assume it is safe to start from index 182 for (int i = index; i < id.length; ++i) { 183 if (id[i] == KEYWORD_ASSIGN) { 184 return true; 185 } 186 } 187 return false; 188 } 189 190 /** 191 * Advance index past language, and accumulate normalized language code in buffer. 192 * Index must be at 0 when this is called. Index is left at a terminator or id 193 * separator. Returns the start of the language code in the buffer. 194 */ parseLanguage()195 private int parseLanguage() { 196 int startLength = buffer.length(); 197 198 if (haveExperimentalLanguagePrefix()) { 199 append(AsciiUtil.toLower(id[0])); 200 append(HYPHEN); 201 index = 2; 202 } 203 204 char c; 205 while(!isTerminatorOrIDSeparator(c = next())) { 206 append(AsciiUtil.toLower(c)); 207 } 208 --index; // unget 209 210 if (buffer.length() - startLength == 3) { 211 String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0)); 212 if (lang != null) { 213 set(0, lang); 214 } 215 } 216 217 return 0; 218 } 219 220 /** 221 * Advance index past language. Index must be at 0 when this is called. Index 222 * is left at a terminator or id separator. 223 */ skipLanguage()224 private void skipLanguage() { 225 if (haveExperimentalLanguagePrefix()) { 226 index = 2; 227 } 228 skipUntilTerminatorOrIDSeparator(); 229 } 230 231 /** 232 * Advance index past script, and accumulate normalized script in buffer. 233 * Index must be immediately after the language. 234 * If the item at this position is not a script (is not four characters 235 * long) leave index and buffer unchanged. Otherwise index is left at 236 * a terminator or id separator. Returns the start of the script code 237 * in the buffer (this may be equal to the buffer length, if there is no 238 * script). 239 */ parseScript()240 private int parseScript() { 241 if (!atTerminator()) { 242 int oldIndex = index; // save original index 243 ++index; 244 245 int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone 246 char c; 247 boolean firstPass = true; 248 while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) { 249 if (firstPass) { 250 addSeparator(); 251 append(AsciiUtil.toUpper(c)); 252 firstPass = false; 253 } else { 254 append(AsciiUtil.toLower(c)); 255 } 256 } 257 --index; // unget 258 259 /* If it's not exactly 4 characters long, then it's not a script. */ 260 if (index - oldIndex != 5) { // +1 to account for separator 261 index = oldIndex; 262 buffer.delete(oldBlen, buffer.length()); 263 } else { 264 oldBlen++; // index past hyphen, for clients who want to extract just the script 265 } 266 267 return oldBlen; 268 } 269 return buffer.length(); 270 } 271 272 /** 273 * Advance index past script. 274 * Index must be immediately after the language and IDSeparator. 275 * If the item at this position is not a script (is not four characters 276 * long) leave index. Otherwise index is left at a terminator or 277 * id separator. 278 */ skipScript()279 private void skipScript() { 280 if (!atTerminator()) { 281 int oldIndex = index; 282 ++index; 283 284 char c; 285 while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)); 286 --index; 287 288 if (index - oldIndex != 5) { // +1 to account for separator 289 index = oldIndex; 290 } 291 } 292 } 293 294 /** 295 * Advance index past country, and accumulate normalized country in buffer. 296 * Index must be immediately after the script (if there is one, else language) 297 * and IDSeparator. Return the start of the country code in the buffer. 298 */ parseCountry()299 private int parseCountry() { 300 if (!atTerminator()) { 301 int oldIndex = index; 302 ++index; 303 304 int oldBlen = buffer.length(); 305 char c; 306 boolean firstPass = true; 307 while (!isTerminatorOrIDSeparator(c = next())) { 308 if (firstPass) { // first, add hyphen 309 hadCountry = true; // we have a country, let variant parsing know 310 addSeparator(); 311 ++oldBlen; // increment past hyphen 312 firstPass = false; 313 } 314 append(AsciiUtil.toUpper(c)); 315 } 316 --index; // unget 317 318 int charsAppended = buffer.length() - oldBlen; 319 320 if (charsAppended == 0) { 321 // Do nothing. 322 } 323 else if (charsAppended < 2 || charsAppended > 3) { 324 // It's not a country, so return index and blen to 325 // their previous values. 326 index = oldIndex; 327 --oldBlen; 328 buffer.delete(oldBlen, buffer.length()); 329 hadCountry = false; 330 } 331 else if (charsAppended == 3) { 332 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen)); 333 if (region != null) { 334 set(oldBlen, region); 335 } 336 } 337 338 return oldBlen; 339 } 340 341 return buffer.length(); 342 } 343 344 /** 345 * Advance index past country. 346 * Index must be immediately after the script (if there is one, else language) 347 * and IDSeparator. 348 */ skipCountry()349 private void skipCountry() { 350 if (!atTerminator()) { 351 if (id[index] == UNDERSCORE || id[index] == HYPHEN) { 352 ++index; 353 } 354 /* 355 * Save the index point after the separator, since the format 356 * requires two separators if the country is not present. 357 */ 358 int oldIndex = index; 359 360 skipUntilTerminatorOrIDSeparator(); 361 int charsSkipped = index - oldIndex; 362 if (charsSkipped < 2 || charsSkipped > 3) { 363 index = oldIndex; 364 } 365 } 366 } 367 368 /** 369 * Advance index past variant, and accumulate normalized variant in buffer. This ignores 370 * the codepage information from POSIX ids. Index must be immediately after the country 371 * or script. Index is left at the keyword separator or at the end of the text. Return 372 * the start of the variant code in the buffer. 373 * 374 * In standard form, we can have the following forms: 375 * ll__VVVV 376 * ll_CC_VVVV 377 * ll_Ssss_VVVV 378 * ll_Ssss_CC_VVVV 379 * 380 * This also handles POSIX ids, which can have the following forms (pppp is code page id): 381 * ll_CC.pppp --> ll_CC 382 * ll_CC.pppp@VVVV --> ll_CC_VVVV 383 * ll_CC@VVVV --> ll_CC_VVVV 384 * 385 * We identify this use of '@' in POSIX ids by looking for an '=' following 386 * the '@'. If there is one, we consider '@' to start a keyword list, instead of 387 * being part of a POSIX id. 388 * 389 * Note: since it was decided that we want an option to not handle POSIX ids, this 390 * becomes a bit more complex. 391 */ parseVariant()392 private int parseVariant() { 393 int oldBlen = buffer.length(); 394 395 boolean start = true; 396 boolean needSeparator = true; 397 boolean skipping = false; 398 char c; 399 boolean firstPass = true; 400 401 while ((c = next()) != DONE) { 402 if (c == DOT) { 403 start = false; 404 skipping = true; 405 } else if (c == KEYWORD_SEPARATOR) { 406 if (haveKeywordAssign()) { 407 break; 408 } 409 skipping = false; 410 start = false; 411 needSeparator = true; // add another underscore if we have more text 412 } else if (start) { 413 start = false; 414 if (c != UNDERSCORE && c != HYPHEN) { 415 index--; 416 } 417 } else if (!skipping) { 418 if (needSeparator) { 419 needSeparator = false; 420 if (firstPass && !hadCountry) { // no country, we'll need two 421 addSeparator(); 422 ++oldBlen; // for sure 423 } 424 addSeparator(); 425 if (firstPass) { // only for the first separator 426 ++oldBlen; 427 firstPass = false; 428 } 429 } 430 c = AsciiUtil.toUpper(c); 431 if (c == HYPHEN || c == COMMA) { 432 c = UNDERSCORE; 433 } 434 append(c); 435 } 436 } 437 --index; // unget 438 439 return oldBlen; 440 } 441 442 // no need for skipvariant, to get the keywords we'll just scan directly for 443 // the keyword separator 444 445 /** 446 * Returns the normalized language id, or the empty string. 447 */ getLanguage()448 public String getLanguage() { 449 reset(); 450 return getString(parseLanguage()); 451 } 452 453 /** 454 * Returns the normalized script id, or the empty string. 455 */ getScript()456 public String getScript() { 457 reset(); 458 skipLanguage(); 459 return getString(parseScript()); 460 } 461 462 /** 463 * return the normalized country id, or the empty string. 464 */ getCountry()465 public String getCountry() { 466 reset(); 467 skipLanguage(); 468 skipScript(); 469 return getString(parseCountry()); 470 } 471 472 /** 473 * Returns the normalized variant id, or the empty string. 474 */ getVariant()475 public String getVariant() { 476 reset(); 477 skipLanguage(); 478 skipScript(); 479 skipCountry(); 480 return getString(parseVariant()); 481 } 482 483 /** 484 * Returns the language, script, country, and variant as separate strings. 485 */ getLanguageScriptCountryVariant()486 public String[] getLanguageScriptCountryVariant() { 487 reset(); 488 return new String[] { 489 getString(parseLanguage()), 490 getString(parseScript()), 491 getString(parseCountry()), 492 getString(parseVariant()) 493 }; 494 } 495 setBaseName(String baseName)496 public void setBaseName(String baseName) { 497 this.baseName = baseName; 498 } 499 parseBaseName()500 public void parseBaseName() { 501 if (baseName != null) { 502 set(0, baseName); 503 } else { 504 reset(); 505 parseLanguage(); 506 parseScript(); 507 parseCountry(); 508 parseVariant(); 509 510 // catch unwanted trailing underscore after country if there was no variant 511 int len = buffer.length(); 512 if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) { 513 buffer.deleteCharAt(len - 1); 514 } 515 } 516 } 517 518 /** 519 * Returns the normalized base form of the locale id. The base 520 * form does not include keywords. 521 */ getBaseName()522 public String getBaseName() { 523 if (baseName != null) { 524 return baseName; 525 } 526 parseBaseName(); 527 return getString(0); 528 } 529 530 /** 531 * Returns the normalized full form of the locale id. The full 532 * form includes keywords if they are present. 533 */ getName()534 public String getName() { 535 parseBaseName(); 536 parseKeywords(); 537 return getString(0); 538 } 539 540 // keyword utilities 541 542 /** 543 * If we have keywords, advance index to the start of the keywords and return true, 544 * otherwise return false. 545 */ setToKeywordStart()546 private boolean setToKeywordStart() { 547 for (int i = index; i < id.length; ++i) { 548 if (id[i] == KEYWORD_SEPARATOR) { 549 if (canonicalize) { 550 for (int j = ++i; j < id.length; ++j) { // increment i past separator for return 551 if (id[j] == KEYWORD_ASSIGN) { 552 index = i; 553 return true; 554 } 555 } 556 } else { 557 if (++i < id.length) { 558 index = i; 559 return true; 560 } 561 } 562 break; 563 } 564 } 565 return false; 566 } 567 isDoneOrKeywordAssign(char c)568 private static boolean isDoneOrKeywordAssign(char c) { 569 return c == DONE || c == KEYWORD_ASSIGN; 570 } 571 isDoneOrItemSeparator(char c)572 private static boolean isDoneOrItemSeparator(char c) { 573 return c == DONE || c == ITEM_SEPARATOR; 574 } 575 getKeyword()576 private String getKeyword() { 577 int start = index; 578 while (!isDoneOrKeywordAssign(next())) { 579 } 580 --index; 581 return AsciiUtil.toLowerString(new String(id, start, index-start).trim()); 582 } 583 getValue()584 private String getValue() { 585 int start = index; 586 while (!isDoneOrItemSeparator(next())) { 587 } 588 --index; 589 return new String(id, start, index-start).trim(); // leave case alone 590 } 591 getKeyComparator()592 private Comparator<String> getKeyComparator() { 593 final Comparator<String> comp = new Comparator<String>() { 594 @Override 595 public int compare(String lhs, String rhs) { 596 return lhs.compareTo(rhs); 597 } 598 }; 599 return comp; 600 } 601 602 /** 603 * Returns a map of the keywords and values, or null if there are none. 604 */ getKeywordMap()605 public Map<String, String> getKeywordMap() { 606 if (keywords == null) { 607 TreeMap<String, String> m = null; 608 if (setToKeywordStart()) { 609 // trim spaces and convert to lower case, both keywords and values. 610 do { 611 String key = getKeyword(); 612 if (key.length() == 0) { 613 break; 614 } 615 char c = next(); 616 if (c != KEYWORD_ASSIGN) { 617 // throw new IllegalArgumentException("key '" + key + "' missing a value."); 618 if (c == DONE) { 619 break; 620 } else { 621 continue; 622 } 623 } 624 String value = getValue(); 625 if (value.length() == 0) { 626 // throw new IllegalArgumentException("key '" + key + "' missing a value."); 627 continue; 628 } 629 if (m == null) { 630 m = new TreeMap<String, String>(getKeyComparator()); 631 } else if (m.containsKey(key)) { 632 // throw new IllegalArgumentException("key '" + key + "' already has a value."); 633 continue; 634 } 635 m.put(key, value); 636 } while (next() == ITEM_SEPARATOR); 637 } 638 keywords = m != null ? m : Collections.<String, String>emptyMap(); 639 } 640 641 return keywords; 642 } 643 644 645 /** 646 * Parse the keywords and return start of the string in the buffer. 647 */ parseKeywords()648 private int parseKeywords() { 649 int oldBlen = buffer.length(); 650 Map<String, String> m = getKeywordMap(); 651 if (!m.isEmpty()) { 652 boolean first = true; 653 for (Map.Entry<String, String> e : m.entrySet()) { 654 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR); 655 first = false; 656 append(e.getKey()); 657 append(KEYWORD_ASSIGN); 658 append(e.getValue()); 659 } 660 if (first == false) { 661 ++oldBlen; 662 } 663 } 664 return oldBlen; 665 } 666 667 /** 668 * Returns an iterator over the keywords, or null if we have an empty map. 669 */ getKeywords()670 public Iterator<String> getKeywords() { 671 Map<String, String> m = getKeywordMap(); 672 return m.isEmpty() ? null : m.keySet().iterator(); 673 } 674 675 /** 676 * Returns the value for the named keyword, or null if the keyword is not 677 * present. 678 */ getKeywordValue(String keywordName)679 public String getKeywordValue(String keywordName) { 680 Map<String, String> m = getKeywordMap(); 681 return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim())); 682 } 683 684 /** 685 * Set the keyword value only if it is not already set to something else. 686 */ defaultKeywordValue(String keywordName, String value)687 public void defaultKeywordValue(String keywordName, String value) { 688 setKeywordValue(keywordName, value, false); 689 } 690 691 /** 692 * Set the value for the named keyword, or unset it if value is null. If 693 * keywordName itself is null, unset all keywords. If keywordName is not null, 694 * value must not be null. 695 */ setKeywordValue(String keywordName, String value)696 public void setKeywordValue(String keywordName, String value) { 697 setKeywordValue(keywordName, value, true); 698 } 699 700 /** 701 * Set the value for the named keyword, or unset it if value is null. If 702 * keywordName itself is null, unset all keywords. If keywordName is not null, 703 * value must not be null. If reset is true, ignore any previous value for 704 * the keyword, otherwise do not change the keyword (including removal of 705 * one or all keywords). 706 */ setKeywordValue(String keywordName, String value, boolean reset)707 private void setKeywordValue(String keywordName, String value, boolean reset) { 708 if (keywordName == null) { 709 if (reset) { 710 // force new map, ignore value 711 keywords = Collections.<String, String>emptyMap(); 712 } 713 } else { 714 keywordName = AsciiUtil.toLowerString(keywordName.trim()); 715 if (keywordName.length() == 0) { 716 throw new IllegalArgumentException("keyword must not be empty"); 717 } 718 if (value != null) { 719 value = value.trim(); 720 if (value.length() == 0) { 721 throw new IllegalArgumentException("value must not be empty"); 722 } 723 } 724 Map<String, String> m = getKeywordMap(); 725 if (m.isEmpty()) { // it is EMPTY_MAP 726 if (value != null) { 727 // force new map 728 keywords = new TreeMap<String, String>(getKeyComparator()); 729 keywords.put(keywordName, value.trim()); 730 } 731 } else { 732 if (reset || !m.containsKey(keywordName)) { 733 if (value != null) { 734 m.put(keywordName, value); 735 } else { 736 m.remove(keywordName); 737 if (m.isEmpty()) { 738 // force new map 739 keywords = Collections.<String, String>emptyMap(); 740 } 741 } 742 } 743 } 744 } 745 } 746 } 747