// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2010-2013, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.impl.locale;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * In-memory representation of a BCP 47 language tag, split into its language,
 * extlang, script, region, variant, extension and private-use parts.
 *
 * <p>Instances are created either by {@link #parse(String, ParseStatus)} from a
 * tag string or by {@link #parseLocale(BaseLocale, LocaleExtensions)} from
 * locale fields. The class also offers static syntax checks ({@code isXxx}) and
 * case canonicalization helpers ({@code canonicalizeXxx}) for single subtags.
 */
public class LanguageTag {
    private static final boolean JDKIMPL = false;

    //
    // static fields
    //
    public static final String SEP = "-";
    public static final String PRIVATEUSE = "x";
    // NOTE(review): unlike its siblings this field is not final. It is kept
    // that way so the public interface is unchanged; treat it as a constant.
    public static String UNDETERMINED = "und";
    public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";

    //
    // Language subtag fields
    //
    private String _language = "";      // language subtag
    private String _script = "";        // script subtag
    private String _region = "";        // region subtag
    private String _privateuse = "";    // privateuse

    private List<String> _extlangs = Collections.emptyList();   // extlang subtags
    private List<String> _variants = Collections.emptyList();   // variant subtags
    private List<String> _extensions = Collections.emptyList(); // extensions

    // The Map contains legacy language tags (marked as "Type: grandfathered" in BCP 47)
    // and their preferred mappings from BCP 47.
    // Each value is the two-element entry {legacy tag, preferred replacement}.
    private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
        new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();

    static {
        // legacy        = irregular           ; non-redundant tags registered
        //               / regular             ; during the RFC 3066 era
        //
        // irregular     = "en-GB-oed"         ; irregular tags do not match
        //               / "i-ami"             ; the 'langtag' production and
        //               / "i-bnn"             ; would not otherwise be
        //               / "i-default"         ; considered 'well-formed'
        //               / "i-enochian"        ; These tags are all valid,
        //               / "i-hak"             ; but most are deprecated
        //               / "i-klingon"         ; in favor of more modern
        //               / "i-lux"             ; subtags or subtag
        //               / "i-mingo"           ; combination
        //               / "i-navajo"
        //               / "i-pwn"
        //               / "i-tao"
        //               / "i-tay"
        //               / "i-tsu"
        //               / "sgn-BE-FR"
        //               / "sgn-BE-NL"
        //               / "sgn-CH-DE"
        //
        // regular       = "art-lojban"        ; these tags match the 'langtag'
        //               / "cel-gaulish"       ; production, but their subtags
        //               / "no-bok"            ; are not extended language
        //               / "no-nyn"            ; or variant subtags: their meaning
        //               / "zh-guoyu"          ; is defined by their registration
        //               / "zh-hakka"          ; and all of these are deprecated
        //               / "zh-min"            ; in favor of a more modern
        //               / "zh-min-nan"        ; subtag or sequence of subtags
        //               / "zh-xiang"

        final String[][] entries = {
            //{"tag",         "preferred"},
            {"art-lojban",    "jbo"},
            {"cel-gaulish",   "xtg"},   // fallback
            {"en-GB-oed",     "en-GB-x-oed"},   // fallback
            {"i-ami",         "ami"},
            {"i-bnn",         "bnn"},
            {"i-default",     "en-x-i-default"},    // fallback
            {"i-enochian",    "und-x-i-enochian"},  // fallback
            {"i-hak",         "hak"},
            {"i-klingon",     "tlh"},
            {"i-lux",         "lb"},
            {"i-mingo",       "see-x-i-mingo"}, // fallback
            {"i-navajo",      "nv"},
            {"i-pwn",         "pwn"},
            {"i-tao",         "tao"},
            {"i-tay",         "tay"},
            {"i-tsu",         "tsu"},
            {"no-bok",        "nb"},
            {"no-nyn",        "nn"},
            {"sgn-BE-FR",     "sfb"},
            {"sgn-BE-NL",     "vgt"},
            {"sgn-CH-DE",     "sgg"},
            {"zh-guoyu",      "cmn"},
            {"zh-hakka",      "hak"},
            {"zh-min",        "nan-x-zh-min"},  // fallback
            {"zh-min-nan",    "nan"},
            {"zh-xiang",      "hsn"},
        };
        for (String[] e : entries) {
            LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
        }
    }

    /** Instances are only created through the static factory methods. */
    private LanguageTag() {
    }

    /**
     * Parses a language tag string into a {@code LanguageTag}.
     *
     * <p>See BCP 47 "Tags for Identifying Languages":
     * https://www.rfc-editor.org/info/bcp47 -->
     * https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
     *
     * <p>If the tag is, or starts with, one of the legacy tags in
     * {@code LEGACY}, that part is replaced by its preferred mapping before
     * tokenizing. The parse length and error index recorded in {@code sts} are
     * always expressed as offsets into the original {@code languageTag}.
     *
     * @param languageTag the tag text to parse
     * @param sts receives the parse length and, on failure, the error index and
     *     message; it is reset first. May be {@code null}, in which case a
     *     throw-away status object is used.
     * @return the tag built from the subtags that could be parsed
     */
    public static LanguageTag parse(String languageTag, ParseStatus sts) {
        if (sts == null) {
            sts = new ParseStatus();
        } else {
            sts.reset();
        }

        StringTokenIterator itr;
        boolean isLegacy = false;
        // (offset in languageTag) - (same position in the tokenized text);
        // non-zero only when a legacy tag was replaced by its mapping.
        int legacyShift = 0;

        String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
        // Language tag is at least 2 alpha so we can skip searching the first 2 chars.
        int dash = 2;
        while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) {
            gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
        }

        if (gfmap != null) {
            if (gfmap[0].length() == languageTag.length()) {
                // use preferred mapping
                itr = new StringTokenIterator(gfmap[1], SEP);
            } else {
                // append the rest of the tag.
                itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP);
            }
            isLegacy = true;
            // The key match is ASCII case-insensitive, so gfmap[0] has the same
            // length as the legacy prefix found in languageTag.
            legacyShift = gfmap[0].length() - gfmap[1].length();
        } else {
            itr = new StringTokenIterator(languageTag, SEP);
        }

        LanguageTag tag = new LanguageTag();

        // langtag must start with either language or privateuse
        if (tag.parseLanguage(itr, sts)) {
            // ExtLang can only be preceded by 2-3 letter language subtag.
            if (tag._language.length() <= 3) {
                tag.parseExtlangs(itr, sts);
            }
            tag.parseScript(itr, sts);
            tag.parseRegion(itr, sts);
            tag.parseVariants(itr, sts);
            tag.parseExtensions(itr, sts);
        }
        tag.parsePrivateuse(itr, sts);

        // Anything left over is a subtag no parser accepted. This check also
        // applies to legacy tags: the text following a legacy prefix comes
        // from the caller and is not guaranteed to be well-formed.
        if (!itr.isDone() && !sts.isError()) {
            String s = itr.current();
            sts._errorIndex = itr.currentStart();
            if (s.length() == 0) {
                sts._errorMsg = "Empty subtag";
            } else {
                sts._errorMsg = "Invalid subtag: " + s;
            }
        }

        if (isLegacy) {
            // A legacy tag is replaced with a well-formed tag above.
            // However, the reported offsets must refer to the original tag.
            // For a fully parsed tag this yields languageTag.length().
            sts._parseLength += legacyShift;
            if (sts.isError()) {
                sts._errorIndex += legacyShift;
            }
        }

        return tag;
    }

    //
    // Language subtag parsers
    //

    /**
     * Takes the current token as the language subtag if {@link #isLanguage}
     * accepts it, records its end as the parse length and advances the iterator.
     *
     * @return {@code true} if a language subtag was consumed
     */
    private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isLanguage(s)) {
            found = true;
            _language = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /**
     * Consumes up to three consecutive tokens accepted by {@link #isExtlang}.
     *
     * @return {@code true} if at least one extlang subtag was consumed
     */
    private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (!isExtlang(s)) {
                break;
            }
            found = true;
            if (_extlangs.isEmpty()) {
                // replace the shared immutable empty list with a mutable one
                _extlangs = new ArrayList<String>(3);
            }
            _extlangs.add(s);
            sts._parseLength = itr.currentEnd();
            itr.next();

            if (_extlangs.size() == 3) {
                // Maximum 3 extlangs
                break;
            }
        }

        return found;
    }

    /**
     * Takes the current token as the script subtag if {@link #isScript}
     * accepts it.
     *
     * @return {@code true} if a script subtag was consumed
     */
    private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isScript(s)) {
            found = true;
            _script = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /**
     * Takes the current token as the region subtag if {@link #isRegion}
     * accepts it.
     *
     * @return {@code true} if a region subtag was consumed
     */
    private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isRegion(s)) {
            found = true;
            _region = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /**
     * Consumes all consecutive tokens accepted by {@link #isVariant}. Variants
     * are stored upper-cased, and a variant that is already present is consumed
     * but not stored again.
     *
     * @return {@code true} if at least one variant subtag was consumed
     */
    private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (!isVariant(s)) {
                break;
            }
            found = true;
            if (_variants.isEmpty()) {
                _variants = new ArrayList<String>(3);
            }
            // Ignore repeated variant.
            // Use ASCII-only case mapping: String.toUpperCase() depends on the
            // default locale (e.g. "i" becomes dotted capital I under Turkish).
            s = AsciiUtil.toUpperString(s);
            if (!_variants.contains(s)) {
                _variants.add(s);
            }
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /**
     * Consumes consecutive extensions, each a singleton followed by one or more
     * extension subtags. A singleton without any subtag is recorded in
     * {@code sts} as an "Incomplete extension" error. When a singleton occurs
     * more than once only its first extension is stored.
     *
     * @return {@code true} if at least one complete extension was consumed
     */
    private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (isExtensionSingleton(s)) {
                int start = itr.currentStart();
                // ASCII-only case mapping: String.toLowerCase() depends on the
                // default locale (e.g. "I" becomes dotless i under Turkish).
                String singleton = AsciiUtil.toLowerString(s);
                StringBuilder sb = new StringBuilder(singleton);

                itr.next();
                while (!itr.isDone()) {
                    s = itr.current();
                    if (isExtensionSubtag(s)) {
                        sb.append(SEP).append(s);
                        sts._parseLength = itr.currentEnd();
                    } else {
                        break;
                    }
                    itr.next();
                }

                // The parse length did not move past the singleton, so no
                // subtag followed it.
                if (sts._parseLength <= start) {
                    sts._errorIndex = start;
                    sts._errorMsg = "Incomplete extension '" + singleton + "'";
                    break;
                }

                if (_extensions.size() == 0) {
                    _extensions = new ArrayList<String>(4);
                }
                // Ignore the extension if it is already in _extensions.
                boolean alreadyHas = false;
                for (String extension : _extensions) {
                    alreadyHas |= extension.charAt(0) == sb.charAt(0);
                }
                if (!alreadyHas) {
                    _extensions.add(sb.toString());
                }
                found = true;
            } else {
                break;
            }
        }
        return found;
    }

    /**
     * Consumes a private-use sequence: the prefix accepted by
     * {@link #isPrivateusePrefix} followed by one or more subtags accepted by
     * {@link #isPrivateuseSubtag}. A prefix without any subtag is recorded in
     * {@code sts} as an "Incomplete privateuse" error.
     *
     * @return {@code true} if a complete private-use sequence was consumed
     */
    private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isPrivateusePrefix(s)) {
            int start = itr.currentStart();
            StringBuilder sb = new StringBuilder(s);

            itr.next();
            while (!itr.isDone()) {
                s = itr.current();
                if (!isPrivateuseSubtag(s)) {
                    break;
                }
                sb.append(SEP).append(s);
                sts._parseLength = itr.currentEnd();

                itr.next();
            }

            if (sts._parseLength <= start) {
                // need at least 1 private subtag
                sts._errorIndex = start;
                sts._errorMsg = "Incomplete privateuse";
            } else {
                _privateuse = sb.toString();
                found = true;
            }
        }

        return found;
    }

    /**
     * Builds a {@code LanguageTag} from locale fields.
     *
     * <p>Language, script and region are copied only when they are
     * syntactically valid subtags. The variant field is split on
     * {@code BaseLocale.SEP}; its leading well-formed variants become variant
     * subtags, and the remaining pieces are appended to the private-use part
     * after {@link #PRIVUSE_VARIANT_PREFIX}, for as long as they are valid
     * private-use subtags. When no language is available,
     * {@link #UNDETERMINED} is used unless the result consists of a private-use
     * part only.
     *
     * @param baseLocale supplies language, script, region and variant
     * @param localeExtensions supplies the extensions and the private-use value
     * @return the resulting tag
     */
    public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
        LanguageTag tag = new LanguageTag();

        String language = baseLocale.getLanguage();
        String script = baseLocale.getScript();
        String region = baseLocale.getRegion();
        String variant = baseLocale.getVariant();

        boolean hasSubtag = false;

        String privuseVar = null;   // store ill-formed variant subtags

        if (language.length() > 0 && isLanguage(language)) {
            // Convert a deprecated language code used by Java to
            // a new code
            if (language.equals("iw")) {
                language = "he";
            } else if (language.equals("ji")) {
                language = "yi";
            } else if (language.equals("in")) {
                language = "id";
            }
            tag._language = language;
        }

        if (script.length() > 0 && isScript(script)) {
            tag._script = canonicalizeScript(script);
            hasSubtag = true;
        }

        if (region.length() > 0 && isRegion(region)) {
            tag._region = canonicalizeRegion(region);
            hasSubtag = true;
        }

        if (JDKIMPL) {
            // Special handling for no_NO_NY - use nn_NO for language tag
            if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) {
                tag._language = "nn";
                variant = "";
            }
        }

        if (variant.length() > 0) {
            List<String> variants = null;
            StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
            while (!varitr.isDone()) {
                String var = varitr.current();
                if (!isVariant(var)) {
                    break;
                }
                if (variants == null) {
                    variants = new ArrayList<String>();
                }
                if (JDKIMPL) {
                    variants.add(var);  // Do not canonicalize!
                } else {
                    variants.add(canonicalizeVariant(var));
                }
                varitr.next();
            }
            if (variants != null) {
                tag._variants = variants;
                hasSubtag = true;
            }
            if (!varitr.isDone()) {
                // ill-formed variant subtags
                StringBuilder buf = new StringBuilder();
                while (!varitr.isDone()) {
                    String prvv = varitr.current();
                    if (!isPrivateuseSubtag(prvv)) {
                        // cannot use private use subtag - truncated
                        break;
                    }
                    if (buf.length() > 0) {
                        buf.append(SEP);
                    }
                    if (!JDKIMPL) {
                        prvv = AsciiUtil.toLowerString(prvv);
                    }
                    buf.append(prvv);
                    varitr.next();
                }
                if (buf.length() > 0) {
                    privuseVar = buf.toString();
                }
            }
        }

        List<String> extensions = null;
        String privateuse = null;

        Set<Character> locextKeys = localeExtensions.getKeys();
        for (Character locextKey : locextKeys) {
            Extension ext = localeExtensions.getExtension(locextKey);
            if (isPrivateusePrefixChar(locextKey.charValue())) {
                privateuse = ext.getValue();
            } else {
                if (extensions == null) {
                    extensions = new ArrayList<String>();
                }
                extensions.add(locextKey.toString() + SEP + ext.getValue());
            }
        }

        if (extensions != null) {
            tag._extensions = extensions;
            hasSubtag = true;
        }

        // append ill-formed variant subtags to private use
        if (privuseVar != null) {
            if (privateuse == null) {
                privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
            } else {
                privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
            }
        }

        if (privateuse != null) {
            tag._privateuse = privateuse;
        }

        if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) {
            // use lang "und" when 1) no language is available AND
            // 2) any of other subtags other than private use are available or
            //    no private use tag is available
            tag._language = UNDETERMINED;
        }

        return tag;
    }

    //
    // Getter methods for language subtag fields
    //

    /** Returns the language subtag, or an empty string if there is none. */
    public String getLanguage() {
        return _language;
    }

    /** Returns an unmodifiable view of the extlang subtags. */
    public List<String> getExtlangs() {
        return Collections.unmodifiableList(_extlangs);
    }

    /** Returns the script subtag, or an empty string if there is none. */
    public String getScript() {
        return _script;
    }

    /** Returns the region subtag, or an empty string if there is none. */
    public String getRegion() {
        return _region;
    }

    /** Returns an unmodifiable view of the variant subtags. */
    public List<String> getVariants() {
        return Collections.unmodifiableList(_variants);
    }

    /** Returns an unmodifiable view of the extensions, one string per singleton. */
    public List<String> getExtensions() {
        return Collections.unmodifiableList(_extensions);
    }

    /** Returns the private-use part, or an empty string if there is none. */
    public String getPrivateuse() {
        return _privateuse;
    }

    //
    // Language subtag syntax checking methods
    //

    /** Returns whether {@code s} is 2 to 8 ASCII letters. */
    public static boolean isLanguage(String s) {
        // language      = 2*3ALPHA            ; shortest ISO 639 code
        //                 ["-" extlang]       ; sometimes followed by
        //                                     ;   extended language subtags
        //               / 4ALPHA              ; or reserved for future use
        //               / 5*8ALPHA            ; or registered language subtag
        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is exactly 3 ASCII letters. */
    public static boolean isExtlang(String s) {
        // extlang       = 3ALPHA              ; selected ISO 639 codes
        //                 *2("-" 3ALPHA)      ; permanently reserved
        return (s.length() == 3) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is exactly 4 ASCII letters. */
    public static boolean isScript(String s) {
        // script        = 4ALPHA              ; ISO 15924 code
        return (s.length() == 4) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is 2 ASCII letters or 3 ASCII digits. */
    public static boolean isRegion(String s) {
        // region        = 2ALPHA              ; ISO 3166-1 code
        //               / 3DIGIT              ; UN M.49 code
        return ((s.length() == 2) && AsciiUtil.isAlphaString(s))
                || ((s.length() == 3) && AsciiUtil.isNumericString(s));
    }

    /**
     * Returns whether {@code s} is 5 to 8 ASCII alphanumerics, or a digit
     * followed by 3 ASCII alphanumerics.
     */
    public static boolean isVariant(String s) {
        // variant       = 5*8alphanum         ; registered variants
        //               / (DIGIT 3alphanum)
        int len = s.length();
        if (len >= 5 && len <= 8) {
            return AsciiUtil.isAlphaNumericString(s);
        }
        if (len == 4) {
            return AsciiUtil.isNumeric(s.charAt(0))
                    && AsciiUtil.isAlphaNumeric(s.charAt(1))
                    && AsciiUtil.isAlphaNumeric(s.charAt(2))
                    && AsciiUtil.isAlphaNumeric(s.charAt(3));
        }
        return false;
    }

    /** Returns whether {@code s} is an ASCII letter followed by an ASCII digit. */
    public static boolean isTKey(String s) {
        // tkey = alpha digit ;
        return (s.length() == 2) && AsciiUtil.isAlpha(s.charAt(0))
            && AsciiUtil.isNumeric(s.charAt(1));
    }

    /**
     * Returns whether {@code s} is a single ASCII alphanumeric other than the
     * private-use prefix "x" (in either case).
     */
    public static boolean isExtensionSingleton(String s) {
        // singleton     = DIGIT               ; 0 - 9
        //               / %x41-57             ; A - W
        //               / %x59-5A             ; Y - Z
        //               / %x61-77             ; a - w
        //               / %x79-7A             ; y - z

        return (s.length() == 1)
                && AsciiUtil.isAlphaNumericString(s)
                && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
    }

    /** Character variant of {@link #isExtensionSingleton(String)}. */
    public static boolean isExtensionSingletonChar(char c) {
        return isExtensionSingleton(String.valueOf(c));
    }

    /** Returns whether {@code s} is 2 to 8 ASCII alphanumerics. */
    public static boolean isExtensionSubtag(String s) {
        // extension     = singleton 1*("-" (2*8alphanum))
        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
    }

    /** Returns whether {@code s} is the private-use prefix "x", ignoring case. */
    public static boolean isPrivateusePrefix(String s) {
        // privateuse    = "x" 1*("-" (1*8alphanum))
        return (s.length() == 1)
                && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
    }

    /** Character variant of {@link #isPrivateusePrefix(String)}. */
    public static boolean isPrivateusePrefixChar(char c) {
        return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
    }

    /** Returns whether {@code s} is 1 to 8 ASCII alphanumerics. */
    public static boolean isPrivateuseSubtag(String s) {
        // privateuse    = "x" 1*("-" (1*8alphanum))
        return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
    }

    //
    // Language subtag canonicalization methods
    //

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizeLanguage(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizeExtlang(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toTitleString}. */
    public static String canonicalizeScript(String s) {
        return AsciiUtil.toTitleString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toUpperString}. */
    public static String canonicalizeRegion(String s) {
        return AsciiUtil.toUpperString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizeVariant(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /**
     * Lower-cases an extension. For a "u-" extension it additionally removes
     * every "-true" and "-yes" subtag.
     *
     * @param s the extension, singleton included
     * @return the canonical form
     */
    public static String canonicalizeExtension(String s) {
        s = AsciiUtil.toLowerString(s);
        if (s.startsWith("u-")) {
            int found;
            while (s.endsWith("-true")) {
                s = s.substring(0, s.length() - 5);  // length of "-true" is 5
            }
            while ((found = s.indexOf("-true-")) > 0) {
                s = s.substring(0, found) + s.substring(found + 5);  // length of "-true" is 5
            }
            while (s.endsWith("-yes")) {
                s = s.substring(0, s.length() - 4);  // length of "-yes" is 4
            }
            while ((found = s.indexOf("-yes-")) > 0) {
                s = s.substring(0, found) + s.substring(found + 4);  // length of "-yes" is 4
            }
        }
        return s;
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizeExtensionSingleton(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizeExtensionSubtag(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizePrivateuse(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /** Returns {@code s} converted with {@code AsciiUtil.toLowerString}. */
    public static String canonicalizePrivateuseSubtag(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /**
     * Joins the stored subtags with {@link #SEP} in the order language,
     * extlangs, script, region, variants, extensions, private use. The parts
     * between language and private use are emitted only when a language is
     * present.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        if (_language.length() > 0) {
            sb.append(_language);

            for (String extlang : _extlangs) {
                sb.append(SEP).append(extlang);
            }

            if (_script.length() > 0) {
                sb.append(SEP).append(_script);
            }

            if (_region.length() > 0) {
                sb.append(SEP).append(_region);
            }

            for (String variant : _variants) {
                sb.append(SEP).append(variant);
            }

            for (String extension : _extensions) {
                sb.append(SEP).append(extension);
            }
        }
        if (_privateuse.length() > 0) {
            if (sb.length() > 0) {
                sb.append(SEP);
            }
            sb.append(_privateuse);
        }

        return sb.toString();
    }
}