1 /* 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2010, International Business Machines Corporation and * 29 * others. All Rights Reserved. * 30 ******************************************************************************* 31 */ 32 package sun.util.locale; 33 34 import java.util.ArrayList; 35 import java.util.Collections; 36 import java.util.HashMap; 37 import java.util.List; 38 import java.util.Map; 39 import java.util.Set; 40 41 public class LanguageTag { 42 // 43 // static fields 44 // 45 public static final String SEP = "-"; 46 public static final String PRIVATEUSE = "x"; 47 public static final String UNDETERMINED = "und"; 48 public static final String PRIVUSE_VARIANT_PREFIX = "lvariant"; 49 50 // 51 // Language subtag fields 52 // 53 private String language = ""; // language subtag 54 private String script = ""; // script subtag 55 private String region = ""; // region subtag 56 private String privateuse = ""; // privateuse 57 58 private List<String> extlangs = Collections.emptyList(); // extlang subtags 59 private List<String> variants = Collections.emptyList(); // variant subtags 60 private List<String> extensions = Collections.emptyList(); // extensions 61 62 // Map contains grandfathered tags and its preferred mappings from 63 // http://www.ietf.org/rfc/rfc5646.txt 64 // Keys are lower-case strings. 65 private static final Map<String, String[]> GRANDFATHERED = new HashMap<>(); 66 67 static { 68 // grandfathered = irregular ; non-redundant tags registered 69 // / regular ; during the RFC 3066 era 70 // 71 // irregular = "en-GB-oed" ; irregular tags do not match 72 // / "i-ami" ; the 'langtag' production and 73 // / "i-bnn" ; would not otherwise be 74 // / "i-default" ; considered 'well-formed' 75 // / "i-enochian" ; These tags are all valid, 76 // / "i-hak" ; but most are deprecated 77 // / "i-klingon" ; in favor of more modern 78 // / "i-lux" ; subtags or subtag 79 // / "i-mingo" ; combination 80 // / "i-navajo" 81 // / "i-pwn" 82 // / "i-tao" 83 // / "i-tay" 84 // / "i-tsu" 85 // / "sgn-BE-FR" 86 // / "sgn-BE-NL" 87 // / "sgn-CH-DE" 88 // 89 // regular = "art-lojban" ; these tags match the 'langtag' 90 // / "cel-gaulish" ; production, but their subtags 91 // / "no-bok" ; are not extended language 92 // / "no-nyn" ; or variant subtags: their meaning 93 // / "zh-guoyu" ; is defined by their registration 94 // / "zh-hakka" ; and all of these are deprecated 95 // / "zh-min" ; in favor of a more modern 96 // / "zh-min-nan" ; subtag or sequence of subtags 97 // / "zh-xiang" 98 99 final String[][] entries = { 100 //{"tag", "preferred"}, 101 {"art-lojban", "jbo"}, 102 {"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback 103 {"en-GB-oed", "en-GB-x-oed"}, // fallback 104 {"i-ami", "ami"}, 105 {"i-bnn", "bnn"}, 106 {"i-default", "en-x-i-default"}, // fallback 107 {"i-enochian", "und-x-i-enochian"}, // fallback 108 {"i-hak", "hak"}, 109 {"i-klingon", "tlh"}, 110 {"i-lux", "lb"}, 111 {"i-mingo", "see-x-i-mingo"}, // fallback 112 {"i-navajo", "nv"}, 113 {"i-pwn", "pwn"}, 114 {"i-tao", "tao"}, 115 {"i-tay", "tay"}, 116 {"i-tsu", "tsu"}, 117 {"no-bok", "nb"}, 118 {"no-nyn", "nn"}, 119 {"sgn-BE-FR", "sfb"}, 120 {"sgn-BE-NL", "vgt"}, 121 {"sgn-CH-DE", "sgg"}, 122 {"zh-guoyu", "cmn"}, 123 {"zh-hakka", "hak"}, 124 {"zh-min", "nan-x-zh-min"}, // fallback 125 {"zh-min-nan", "nan"}, 126 {"zh-xiang", "hsn"}, 127 }; 128 for (String[] e : entries) { LocaleUtils.toLowerString(e[0])129 GRANDFATHERED.put(LocaleUtils.toLowerString(e[0]), e); 130 } 131 } 132 LanguageTag()133 private LanguageTag() { 134 } 135 136 /* 137 * BNF in RFC5646 138 * 139 * Language-Tag = langtag ; normal language tags 140 * / privateuse ; private use tag 141 * / grandfathered ; grandfathered tags 142 * 143 * 144 * langtag = language 145 * ["-" script] 146 * ["-" region] 147 * *("-" variant) 148 * *("-" extension) 149 * ["-" privateuse] 150 * 151 * language = 2*3ALPHA ; shortest ISO 639 code 152 * ["-" extlang] ; sometimes followed by 153 * ; extended language subtags 154 * / 4ALPHA ; or reserved for future use 155 * / 5*8ALPHA ; or registered language subtag 156 * 157 * extlang = 3ALPHA ; selected ISO 639 codes 158 * *2("-" 3ALPHA) ; permanently reserved 159 * 160 * script = 4ALPHA ; ISO 15924 code 161 * 162 * region = 2ALPHA ; ISO 3166-1 code 163 * / 3DIGIT ; UN M.49 code 164 * 165 * variant = 5*8alphanum ; registered variants 166 * / (DIGIT 3alphanum) 167 * 168 * extension = singleton 1*("-" (2*8alphanum)) 169 * 170 * ; Single alphanumerics 171 * ; "x" reserved for private use 172 * singleton = DIGIT ; 0 - 9 173 * / %x41-57 ; A - W 174 * / %x59-5A ; Y - Z 175 * / %x61-77 ; a - w 176 * / %x79-7A ; y - z 177 * 178 * privateuse = "x" 1*("-" (1*8alphanum)) 179 * 180 */ parse(String languageTag, ParseStatus sts)181 public static LanguageTag parse(String languageTag, ParseStatus sts) { 182 if (sts == null) { 183 sts = new ParseStatus(); 184 } else { 185 sts.reset(); 186 } 187 188 StringTokenIterator itr; 189 190 // Check if the tag is grandfathered 191 String[] gfmap = GRANDFATHERED.get(LocaleUtils.toLowerString(languageTag)); 192 if (gfmap != null) { 193 // use preferred mapping 194 itr = new StringTokenIterator(gfmap[1], SEP); 195 } else { 196 itr = new StringTokenIterator(languageTag, SEP); 197 } 198 199 LanguageTag tag = new LanguageTag(); 200 201 // langtag must start with either language or privateuse 202 if (tag.parseLanguage(itr, sts)) { 203 tag.parseExtlangs(itr, sts); 204 tag.parseScript(itr, sts); 205 tag.parseRegion(itr, sts); 206 tag.parseVariants(itr, sts); 207 tag.parseExtensions(itr, sts); 208 } 209 tag.parsePrivateuse(itr, sts); 210 211 if (!itr.isDone() && !sts.isError()) { 212 String s = itr.current(); 213 sts.errorIndex = itr.currentStart(); 214 if (s.length() == 0) { 215 sts.errorMsg = "Empty subtag"; 216 } else { 217 sts.errorMsg = "Invalid subtag: " + s; 218 } 219 } 220 221 return tag; 222 } 223 224 // 225 // Language subtag parsers 226 // 227 parseLanguage(StringTokenIterator itr, ParseStatus sts)228 private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) { 229 if (itr.isDone() || sts.isError()) { 230 return false; 231 } 232 233 boolean found = false; 234 235 String s = itr.current(); 236 if (isLanguage(s)) { 237 found = true; 238 language = s; 239 sts.parseLength = itr.currentEnd(); 240 itr.next(); 241 } 242 243 return found; 244 } 245 parseExtlangs(StringTokenIterator itr, ParseStatus sts)246 private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) { 247 if (itr.isDone() || sts.isError()) { 248 return false; 249 } 250 251 boolean found = false; 252 253 while (!itr.isDone()) { 254 String s = itr.current(); 255 if (!isExtlang(s)) { 256 break; 257 } 258 found = true; 259 if (extlangs.isEmpty()) { 260 extlangs = new ArrayList<>(3); 261 } 262 extlangs.add(s); 263 sts.parseLength = itr.currentEnd(); 264 itr.next(); 265 266 if (extlangs.size() == 3) { 267 // Maximum 3 extlangs 268 break; 269 } 270 } 271 272 return found; 273 } 274 parseScript(StringTokenIterator itr, ParseStatus sts)275 private boolean parseScript(StringTokenIterator itr, ParseStatus sts) { 276 if (itr.isDone() || sts.isError()) { 277 return false; 278 } 279 280 boolean found = false; 281 282 String s = itr.current(); 283 if (isScript(s)) { 284 found = true; 285 script = s; 286 sts.parseLength = itr.currentEnd(); 287 itr.next(); 288 } 289 290 return found; 291 } 292 parseRegion(StringTokenIterator itr, ParseStatus sts)293 private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) { 294 if (itr.isDone() || sts.isError()) { 295 return false; 296 } 297 298 boolean found = false; 299 300 String s = itr.current(); 301 if (isRegion(s)) { 302 found = true; 303 region = s; 304 sts.parseLength = itr.currentEnd(); 305 itr.next(); 306 } 307 308 return found; 309 } 310 parseVariants(StringTokenIterator itr, ParseStatus sts)311 private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) { 312 if (itr.isDone() || sts.isError()) { 313 return false; 314 } 315 316 boolean found = false; 317 318 while (!itr.isDone()) { 319 String s = itr.current(); 320 if (!isVariant(s)) { 321 break; 322 } 323 found = true; 324 if (variants.isEmpty()) { 325 variants = new ArrayList<>(3); 326 } 327 variants.add(s); 328 sts.parseLength = itr.currentEnd(); 329 itr.next(); 330 } 331 332 return found; 333 } 334 parseExtensions(StringTokenIterator itr, ParseStatus sts)335 private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) { 336 if (itr.isDone() || sts.isError()) { 337 return false; 338 } 339 340 boolean found = false; 341 342 while (!itr.isDone()) { 343 String s = itr.current(); 344 if (isExtensionSingleton(s)) { 345 int start = itr.currentStart(); 346 String singleton = s; 347 StringBuilder sb = new StringBuilder(singleton); 348 349 itr.next(); 350 while (!itr.isDone()) { 351 s = itr.current(); 352 if (isExtensionSubtag(s)) { 353 sb.append(SEP).append(s); 354 sts.parseLength = itr.currentEnd(); 355 } else { 356 break; 357 } 358 itr.next(); 359 } 360 361 if (sts.parseLength <= start) { 362 sts.errorIndex = start; 363 sts.errorMsg = "Incomplete extension '" + singleton + "'"; 364 break; 365 } 366 367 if (extensions.isEmpty()) { 368 extensions = new ArrayList<>(4); 369 } 370 extensions.add(sb.toString()); 371 found = true; 372 } else { 373 break; 374 } 375 } 376 return found; 377 } 378 parsePrivateuse(StringTokenIterator itr, ParseStatus sts)379 private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) { 380 if (itr.isDone() || sts.isError()) { 381 return false; 382 } 383 384 boolean found = false; 385 386 String s = itr.current(); 387 if (isPrivateusePrefix(s)) { 388 int start = itr.currentStart(); 389 StringBuilder sb = new StringBuilder(s); 390 391 itr.next(); 392 while (!itr.isDone()) { 393 s = itr.current(); 394 if (!isPrivateuseSubtag(s)) { 395 break; 396 } 397 sb.append(SEP).append(s); 398 sts.parseLength = itr.currentEnd(); 399 400 itr.next(); 401 } 402 403 if (sts.parseLength <= start) { 404 // need at least 1 private subtag 405 sts.errorIndex = start; 406 sts.errorMsg = "Incomplete privateuse"; 407 } else { 408 privateuse = sb.toString(); 409 found = true; 410 } 411 } 412 413 return found; 414 } 415 parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions)416 public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) { 417 LanguageTag tag = new LanguageTag(); 418 419 String language = baseLocale.getLanguage(); 420 String script = baseLocale.getScript(); 421 String region = baseLocale.getRegion(); 422 String variant = baseLocale.getVariant(); 423 424 boolean hasSubtag = false; 425 426 String privuseVar = null; // store ill-formed variant subtags 427 428 if (isLanguage(language)) { 429 // Convert a deprecated language code to its new code 430 if (language.equals("iw")) { 431 language = "he"; 432 } else if (language.equals("ji")) { 433 language = "yi"; 434 } else if (language.equals("in")) { 435 language = "id"; 436 } 437 tag.language = language; 438 } 439 440 if (isScript(script)) { 441 tag.script = canonicalizeScript(script); 442 hasSubtag = true; 443 } 444 445 if (isRegion(region)) { 446 tag.region = canonicalizeRegion(region); 447 hasSubtag = true; 448 } 449 450 // Special handling for no_NO_NY - use nn_NO for language tag 451 if (tag.language.equals("no") && tag.region.equals("NO") && variant.equals("NY")) { 452 tag.language = "nn"; 453 variant = ""; 454 } 455 456 if (variant.length() > 0) { 457 List<String> variants = null; 458 StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP); 459 while (!varitr.isDone()) { 460 String var = varitr.current(); 461 if (!isVariant(var)) { 462 break; 463 } 464 if (variants == null) { 465 variants = new ArrayList<>(); 466 } 467 variants.add(var); // Do not canonicalize! 468 varitr.next(); 469 } 470 if (variants != null) { 471 tag.variants = variants; 472 hasSubtag = true; 473 } 474 if (!varitr.isDone()) { 475 // ill-formed variant subtags 476 StringBuilder buf = new StringBuilder(); 477 while (!varitr.isDone()) { 478 String prvv = varitr.current(); 479 if (!isPrivateuseSubtag(prvv)) { 480 // cannot use private use subtag - truncated 481 break; 482 } 483 if (buf.length() > 0) { 484 buf.append(SEP); 485 } 486 buf.append(prvv); 487 varitr.next(); 488 } 489 if (buf.length() > 0) { 490 privuseVar = buf.toString(); 491 } 492 } 493 } 494 495 List<String> extensions = null; 496 String privateuse = null; 497 498 if (localeExtensions != null) { 499 Set<Character> locextKeys = localeExtensions.getKeys(); 500 for (Character locextKey : locextKeys) { 501 Extension ext = localeExtensions.getExtension(locextKey); 502 if (isPrivateusePrefixChar(locextKey)) { 503 privateuse = ext.getValue(); 504 } else { 505 if (extensions == null) { 506 extensions = new ArrayList<>(); 507 } 508 extensions.add(locextKey.toString() + SEP + ext.getValue()); 509 } 510 } 511 } 512 513 if (extensions != null) { 514 tag.extensions = extensions; 515 hasSubtag = true; 516 } 517 518 // append ill-formed variant subtags to private use 519 if (privuseVar != null) { 520 if (privateuse == null) { 521 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar; 522 } else { 523 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX 524 + SEP + privuseVar.replace(BaseLocale.SEP, SEP); 525 } 526 } 527 528 if (privateuse != null) { 529 tag.privateuse = privateuse; 530 } 531 532 if (tag.language.length() == 0 && (hasSubtag || privateuse == null)) { 533 // use lang "und" when 1) no language is available AND 534 // 2) any of other subtags other than private use are available or 535 // no private use tag is available 536 tag.language = UNDETERMINED; 537 } 538 539 return tag; 540 } 541 542 // 543 // Getter methods for language subtag fields 544 // 545 getLanguage()546 public String getLanguage() { 547 return language; 548 } 549 getExtlangs()550 public List<String> getExtlangs() { 551 if (extlangs.isEmpty()) { 552 return Collections.emptyList(); 553 } 554 return Collections.unmodifiableList(extlangs); 555 } 556 getScript()557 public String getScript() { 558 return script; 559 } 560 getRegion()561 public String getRegion() { 562 return region; 563 } 564 getVariants()565 public List<String> getVariants() { 566 if (variants.isEmpty()) { 567 return Collections.emptyList(); 568 } 569 return Collections.unmodifiableList(variants); 570 } 571 getExtensions()572 public List<String> getExtensions() { 573 if (extensions.isEmpty()) { 574 return Collections.emptyList(); 575 } 576 return Collections.unmodifiableList(extensions); 577 } 578 getPrivateuse()579 public String getPrivateuse() { 580 return privateuse; 581 } 582 583 // 584 // Language subtag syntax checking methods 585 // 586 isLanguage(String s)587 public static boolean isLanguage(String s) { 588 // language = 2*3ALPHA ; shortest ISO 639 code 589 // ["-" extlang] ; sometimes followed by 590 // ; extended language subtags 591 // / 4ALPHA ; or reserved for future use 592 // / 5*8ALPHA ; or registered language subtag 593 int len = s.length(); 594 return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaString(s); 595 } 596 isExtlang(String s)597 public static boolean isExtlang(String s) { 598 // extlang = 3ALPHA ; selected ISO 639 codes 599 // *2("-" 3ALPHA) ; permanently reserved 600 return (s.length() == 3) && LocaleUtils.isAlphaString(s); 601 } 602 isScript(String s)603 public static boolean isScript(String s) { 604 // script = 4ALPHA ; ISO 15924 code 605 return (s.length() == 4) && LocaleUtils.isAlphaString(s); 606 } 607 isRegion(String s)608 public static boolean isRegion(String s) { 609 // region = 2ALPHA ; ISO 3166-1 code 610 // / 3DIGIT ; UN M.49 code 611 return ((s.length() == 2) && LocaleUtils.isAlphaString(s)) 612 || ((s.length() == 3) && LocaleUtils.isNumericString(s)); 613 } 614 isVariant(String s)615 public static boolean isVariant(String s) { 616 // variant = 5*8alphanum ; registered variants 617 // / (DIGIT 3alphanum) 618 int len = s.length(); 619 if (len >= 5 && len <= 8) { 620 return LocaleUtils.isAlphaNumericString(s); 621 } 622 if (len == 4) { 623 return LocaleUtils.isNumeric(s.charAt(0)) 624 && LocaleUtils.isAlphaNumeric(s.charAt(1)) 625 && LocaleUtils.isAlphaNumeric(s.charAt(2)) 626 && LocaleUtils.isAlphaNumeric(s.charAt(3)); 627 } 628 return false; 629 } 630 isExtensionSingleton(String s)631 public static boolean isExtensionSingleton(String s) { 632 // singleton = DIGIT ; 0 - 9 633 // / %x41-57 ; A - W 634 // / %x59-5A ; Y - Z 635 // / %x61-77 ; a - w 636 // / %x79-7A ; y - z 637 638 return (s.length() == 1) 639 && LocaleUtils.isAlphaString(s) 640 && !LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s); 641 } 642 isExtensionSingletonChar(char c)643 public static boolean isExtensionSingletonChar(char c) { 644 return isExtensionSingleton(String.valueOf(c)); 645 } 646 isExtensionSubtag(String s)647 public static boolean isExtensionSubtag(String s) { 648 // extension = singleton 1*("-" (2*8alphanum)) 649 int len = s.length(); 650 return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaNumericString(s); 651 } 652 isPrivateusePrefix(String s)653 public static boolean isPrivateusePrefix(String s) { 654 // privateuse = "x" 1*("-" (1*8alphanum)) 655 return (s.length() == 1) 656 && LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s); 657 } 658 isPrivateusePrefixChar(char c)659 public static boolean isPrivateusePrefixChar(char c) { 660 return (LocaleUtils.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c))); 661 } 662 isPrivateuseSubtag(String s)663 public static boolean isPrivateuseSubtag(String s) { 664 // privateuse = "x" 1*("-" (1*8alphanum)) 665 int len = s.length(); 666 return (len >= 1) && (len <= 8) && LocaleUtils.isAlphaNumericString(s); 667 } 668 669 // 670 // Language subtag canonicalization methods 671 // 672 canonicalizeLanguage(String s)673 public static String canonicalizeLanguage(String s) { 674 return LocaleUtils.toLowerString(s); 675 } 676 canonicalizeExtlang(String s)677 public static String canonicalizeExtlang(String s) { 678 return LocaleUtils.toLowerString(s); 679 } 680 canonicalizeScript(String s)681 public static String canonicalizeScript(String s) { 682 return LocaleUtils.toTitleString(s); 683 } 684 canonicalizeRegion(String s)685 public static String canonicalizeRegion(String s) { 686 return LocaleUtils.toUpperString(s); 687 } 688 canonicalizeVariant(String s)689 public static String canonicalizeVariant(String s) { 690 return LocaleUtils.toLowerString(s); 691 } 692 canonicalizeExtension(String s)693 public static String canonicalizeExtension(String s) { 694 return LocaleUtils.toLowerString(s); 695 } 696 canonicalizeExtensionSingleton(String s)697 public static String canonicalizeExtensionSingleton(String s) { 698 return LocaleUtils.toLowerString(s); 699 } 700 canonicalizeExtensionSubtag(String s)701 public static String canonicalizeExtensionSubtag(String s) { 702 return LocaleUtils.toLowerString(s); 703 } 704 canonicalizePrivateuse(String s)705 public static String canonicalizePrivateuse(String s) { 706 return LocaleUtils.toLowerString(s); 707 } 708 canonicalizePrivateuseSubtag(String s)709 public static String canonicalizePrivateuseSubtag(String s) { 710 return LocaleUtils.toLowerString(s); 711 } 712 713 @Override toString()714 public String toString() { 715 StringBuilder sb = new StringBuilder(); 716 717 if (language.length() > 0) { 718 sb.append(language); 719 720 for (String extlang : extlangs) { 721 sb.append(SEP).append(extlang); 722 } 723 724 if (script.length() > 0) { 725 sb.append(SEP).append(script); 726 } 727 728 if (region.length() > 0) { 729 sb.append(SEP).append(region); 730 } 731 732 for (String variant : variants) { 733 sb.append(SEP).append(variant); 734 } 735 736 for (String extension : extensions) { 737 sb.append(SEP).append(extension); 738 } 739 } 740 if (privateuse.length() > 0) { 741 if (sb.length() > 0) { 742 sb.append(SEP); 743 } 744 sb.append(privateuse); 745 } 746 747 return sb.toString(); 748 } 749 } 750