1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.CharMatcher.whitespace; 6 import static com.google.common.base.Preconditions.checkArgument; 7 import static com.google.common.base.Preconditions.checkNotNull; 8 import static com.google.common.base.Preconditions.checkState; 9 import static com.google.common.collect.ImmutableMap.toImmutableMap; 10 import static java.util.function.Function.identity; 11 import static org.unicode.cldr.api.AttributeKey.keyOf; 12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY; 13 14 import java.util.Arrays; 15 import java.util.HashMap; 16 import java.util.List; 17 import java.util.Map; 18 import java.util.Objects; 19 import java.util.Optional; 20 import java.util.Set; 21 import java.util.function.Function; 22 import java.util.regex.Matcher; 23 import java.util.regex.Pattern; 24 import java.util.stream.Stream; 25 26 import org.unicode.cldr.api.AttributeKey; 27 import org.unicode.cldr.api.CldrDataSupplier; 28 import org.unicode.cldr.api.CldrDataType; 29 import org.unicode.cldr.api.PathMatcher; 30 31 import com.google.common.base.Ascii; 32 import com.google.common.base.Splitter; 33 import com.google.common.base.Strings; 34 import com.google.common.collect.HashBasedTable; 35 import com.google.common.collect.ImmutableMap; 36 import com.google.common.collect.ImmutableSet; 37 import com.google.common.collect.ImmutableTable; 38 import com.google.common.collect.Sets; 39 import com.google.common.collect.Table; 40 41 /** 42 * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic 43 * in some mapper classes. 44 * 45 * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is 46 * build using the same underlying CLDR data. The only reason mapper classes do not create their 47 * own instances directly is the relative cost of processing all the supplemental data each time. 48 */ 49 // TODO: This should be moved into the API and leverage some of the existing utility functions. 50 public final class SupplementalData { 51 // Special IDs which are not supported via CLDR, but for which synthetic data is injected. 52 // The "TRADITIONAL" variants are here because their calendar differs from the non-variant 53 // locale. However CLDR cannot represent this currently because calendar defaults are in 54 // supplemental data (rather than locale data) and are keyed only on territory. 55 private static final ImmutableSet<String> PHANTOM_LOCALE_IDS = 56 ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL"); 57 58 private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}"); 59 60 private static final PathMatcher ALIAS = 61 PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]"); 62 63 private static final PathMatcher PARENT_LOCALE = 64 PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]"); 65 private static final AttributeKey PARENT = keyOf("parentLocale", "parent"); 66 private static final AttributeKey LOCALES = keyOf("parentLocale", "locales"); 67 68 private static final PathMatcher CALENDER_PREFERENCE = 69 PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]"); 70 private static final AttributeKey CALENDER_TERRITORIES = 71 keyOf("calendarPreference", "territories"); 72 private static final AttributeKey CALENDER_ORDERING = 73 keyOf("calendarPreference", "ordering"); 74 75 private static final PathMatcher LIKELY_SUBTAGS = 76 PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]"); 77 private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from"); 78 private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to"); 79 80 private static final Splitter LIST_SPLITTER = 81 Splitter.on(whitespace()).omitEmptyStrings(); 82 83 // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than 84 // a single value (it's structurally always a list, but only territory aliases have a need for 85 // more than one value). 86 private enum Alias { 87 LANGUAGE, SCRIPT, TERRITORY; 88 89 private static final ImmutableMap<String, Alias> TYPE_MAP = 90 Arrays.stream(values()) 91 .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity())); 92 93 private final String elementName = Ascii.toLowerCase(name()) + "Alias"; 94 final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type"); 95 final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement"); 96 forElementName(String name)97 static Optional<Alias> forElementName(String name) { 98 return Optional.ofNullable(TYPE_MAP.get(name)); 99 } 100 } 101 102 /** 103 * Creates a supplemental data API instance from the given CLDR data supplier. 104 * 105 * @param src the CLDR data supplier. 106 * @return the supplemental data API. 107 */ create(CldrDataSupplier src)108 public static SupplementalData create(CldrDataSupplier src) { 109 Table<Alias, String, String> aliasTable = HashBasedTable.create(); 110 Map<String, String> parentLocaleMap = new HashMap<>(); 111 Map<String, String> defaultCalendarMap = new HashMap<>(); 112 Map<String, String> likelySubtagMap = new HashMap<>(); 113 114 src.getDataForType(CldrDataType.SUPPLEMENTAL).accept( 115 ARBITRARY, 116 v -> { 117 if (ALIAS.matches(v.getPath())) { 118 // Territory alias replacements can be a list of values (e.g. when countries 119 // break up). We use the first (geo-politically most significant) value. This 120 // doesn't happen for languages or scripts, but could in theory. 121 Alias.forElementName(v.getPath().getName()).ifPresent( 122 alias -> aliasTable.put( 123 alias, 124 alias.typeKey.valueFrom(v), 125 alias.replacementKey.valueFrom(v))); 126 } else if (PARENT_LOCALE.matches(v.getPath())) { 127 String p = PARENT.valueFrom(v); 128 LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p)); 129 } else if (CALENDER_PREFERENCE.matches(v.getPath())) { 130 String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0); 131 CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c)); 132 } else if (LIKELY_SUBTAGS.matches(v.getPath())) { 133 likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v)); 134 } 135 }); 136 137 Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS); 138 return new SupplementalData( 139 availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap); 140 } 141 142 // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU 143 // data generation. Because this is mutable, it is thoroughly unsuitable for general use. 144 private static final class LocaleId { 145 // From: https://unicode.org/reports/tr35/#Identifiers 146 // Locale ID is: 147 // (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)* 148 // 149 // However in CLDR data, there's always a language (even if it's "und"), and never more 150 // than one variant, so this can be simplified to: 151 // <language>(_<script>)?(_<region>)?(_<variant>)? 152 // 153 // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw"). 154 // Note that the specification allows for languages 5-8 characters long, but in reality 155 // this has never occurred yet, so it's ignored in this code. 156 // 157 // * Script is 4-letter Xxxx script identifier (e.g. "Latn"). 158 // The specification permits any casing for script subtags, but since all the data uses 159 // the capitalized "Xxxx" form, that's what this code expects. 160 // 161 // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric 162 // identifier (e.g. "001"). 163 // 164 // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting 165 // with a digit (this avoids any ambiguity with script subtags). However because ICU 166 // violates this rule by using "TRADITIONAL" (11-letters) the length restriction is 167 // merely "longer than 5". 168 // 169 // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows 170 // for either '-' or '_'). 171 // 172 // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is: 173 private static final Pattern LOCALE_ID = 174 Pattern.compile("([a-z]{2,3})" 175 + "(?:_([A-Z][a-z]{3}))?" 176 + "(?:_([A-Z]{2}|[0-9]{3}))?" 177 + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?"); 178 parse(String localeId)179 static LocaleId parse(String localeId) { 180 Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null")); 181 checkArgument(m.matches(), "invalid locale ID: %s", localeId); 182 return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4)); 183 } 184 of(String language, String script, String region)185 static LocaleId of(String language, String script, String region) { 186 return new LocaleId().setLanguage(language).setScript(script).setRegion(region); 187 } 188 189 // Only the language subtag is non-nullable. 190 private String languageSubtag; 191 private String scriptSubtag; 192 private String regionSubtag; 193 private String variantSubtag; 194 getLanguage()195 String getLanguage() { 196 return languageSubtag; 197 } 198 getScript()199 String getScript() { 200 return scriptSubtag; 201 } 202 getRegion()203 String getRegion() { 204 return regionSubtag; 205 } 206 getVariant()207 String getVariant() { 208 return variantSubtag; 209 } 210 setLanguage(String languageSubtag)211 LocaleId setLanguage(String languageSubtag) { 212 checkNotNull(languageSubtag, "language subtag must not be null"); 213 checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty"); 214 this.languageSubtag = languageSubtag; 215 return this; 216 } 217 setScript(String scriptSubtag)218 LocaleId setScript(String scriptSubtag) { 219 this.scriptSubtag = Strings.emptyToNull(scriptSubtag); 220 return this; 221 } 222 setRegion(String regionSubtag)223 LocaleId setRegion(String regionSubtag) { 224 this.regionSubtag = Strings.emptyToNull(regionSubtag); 225 return this; 226 } 227 setVariant(String variantSubtag)228 LocaleId setVariant(String variantSubtag) { 229 this.variantSubtag = Strings.emptyToNull(variantSubtag); 230 return this; 231 } 232 toString()233 @Override public String toString() { 234 StringBuilder id = new StringBuilder(languageSubtag); 235 if (scriptSubtag != null) { 236 id.append("_").append(scriptSubtag); 237 } 238 if (regionSubtag != null) { 239 id.append("_").append(regionSubtag); 240 } 241 if (variantSubtag != null) { 242 id.append("_").append(variantSubtag); 243 } 244 return id.toString(); 245 } 246 equals(Object o)247 @Override public boolean equals(Object o) { 248 if (!(o instanceof LocaleId)) { 249 return false; 250 } 251 LocaleId other = (LocaleId) o; 252 return Objects.equals(languageSubtag, other.languageSubtag) 253 && Objects.equals(scriptSubtag, other.scriptSubtag) 254 && Objects.equals(regionSubtag, other.regionSubtag) 255 && Objects.equals(variantSubtag, other.variantSubtag); 256 } 257 hashCode()258 @Override public int hashCode() { 259 return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag); 260 } 261 } 262 263 private final ImmutableSet<String> availableIds; 264 private final ImmutableTable<Alias, String, String> aliasTable; 265 private final ImmutableMap<String, String> parentLocaleMap; 266 private final ImmutableMap<String, String> defaultCalendarMap; 267 private final ImmutableMap<String, String> likelySubtagMap; 268 SupplementalData( Set<String> availableIds, Table<Alias, String, String> aliasTable, Map<String, String> parentLocaleMap, Map<String, String> defaultCalendarMap, Map<String, String> likelySubtagMap)269 private SupplementalData( 270 Set<String> availableIds, 271 Table<Alias, String, String> aliasTable, 272 Map<String, String> parentLocaleMap, 273 Map<String, String> defaultCalendarMap, 274 Map<String, String> likelySubtagMap) { 275 276 this.availableIds = ImmutableSet.copyOf(availableIds); 277 this.aliasTable = ImmutableTable.copyOf(aliasTable); 278 this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap); 279 this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap); 280 this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap); 281 } 282 getAvailableLocaleIds()283 public ImmutableSet<String> getAvailableLocaleIds() { 284 return availableIds; 285 } 286 287 /** 288 * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible. 289 */ maximize(String localeId)290 public Optional<String> maximize(String localeId) { 291 return addLikelySubtags(localeId).map(Object::toString); 292 } 293 294 /** 295 * Returns the locale ID with any deprecated elements replaced. This is an 296 * implementation of the algorithm specified in 297 * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML 298 * specification</a> but without any "minimizing" of the final result (as happens for 299 * canonicalization in the CLDR tools). 300 */ replaceDeprecatedTags(String localeId)301 public String replaceDeprecatedTags(String localeId) { 302 if (localeId.equals("root")) { 303 return localeId; 304 } 305 LocaleId id = LocaleId.parse(localeId); 306 307 // ---- LDML Specification ---- 308 // If the region subtag matches the type attribute of a territoryAlias element in 309 // Supplemental Data, replace the region subtag with the replacement value, as follows: 310 // 311 // * If there is a single territory in the replacement, use it. 312 // * If there are multiple territories: 313 // * Look up the most likely territory for the base language code (and script, if there 314 // is one). 315 // * If that likely territory is in the list, use it. 316 // * Otherwise, use the first territory in the list. 317 // ---- 318 // However there is a footnote that says: 319 // Formally, replacement of multiple territories uses Section 4.3 Likely Subtags. 320 // However, there are a small number of cases of multiple territories, so the mappings 321 // can be precomputed. This results in a faster lookup with a very small subset of the 322 // likely subtags data. 323 // 324 // Note that (contrary to the order implied by the LDML specification) this step is 325 // performed _before_ the language alias lookup. This is to allow ID such as "sr_YU" to 326 // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to 327 // "sr_Cryl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out. 328 // 329 // TODO: Can we simplify this my just using "addLikelySubtags()" when region is missing? 330 if (id.getRegion() != null) { 331 String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion()); 332 if (replacementRegions != null) { 333 List<String> regions = LIST_SPLITTER.splitToList(replacementRegions); 334 checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId); 335 if (regions.size() == 1) { 336 id.setRegion(regions.get(0)); 337 } else { 338 LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null); 339 String likelyId = likelySubtagMap.get(key.toString()); 340 if (likelyId == null) { 341 likelyId = likelySubtagMap.get(key.setScript(null).toString()); 342 } 343 String likelyRegion = 344 likelyId != null ? LocaleId.parse(likelyId).getRegion() : null; 345 if (regions.contains(likelyRegion)) { 346 id.setRegion(likelyRegion); 347 } else { 348 id.setRegion(regions.get(0)); 349 } 350 } 351 } 352 } 353 354 // While it's not mentioned in the LDML specification, there is data in the alias table for 355 // replacement scripts (currently it contains exactly one entry with one value). Because 356 // its not clear if this is intended to only be single values or a list (and how to handle 357 // it if it were a list), there's a hard check to ensure it's only ever a single value. 358 if (id.getScript() != null) { 359 String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript()); 360 if (replacementScript != null) { 361 checkArgument(whitespace().matchesNoneOf(replacementScript), 362 "unexpected list of replacement scripts: %s", replacementScript); 363 id.setScript(replacementScript); 364 } 365 } 366 367 // ---- LDML Specification ---- 368 // If the language subtag matches the type attribute of a languageAlias element in 369 // Supplemental Data, replace the language subtag with the replacement value. 370 // 371 // If there are additional subtags in the replacement value, add them to the result, but 372 // only if there is no corresponding subtag already in the tag. 373 // ---- 374 // Contrary to the precise wording of the specification, we don't just check the language 375 // subtag, since language aliases can contain script and even region information. Instead 376 // we check the alias table using the same order as defined in subtag maximizing: 377 // 378 // <language>_<script>_<region> 379 // <language>_<region> 380 // <language>_<script> 381 // <language> 382 // 383 // There is no need to check for "und" however since that's not aliased anything, but since 384 // it shares the same code it's harmless to do. 385 resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s)) 386 .ifPresent(resolvedId -> { 387 id.setLanguage(checkNotNull(resolvedId.getLanguage(), 388 "missing language subtag in language alias: %s", resolvedId)); 389 if (id.getScript() == null) { 390 id.setScript(resolvedId.getScript()); 391 } 392 if (id.getRegion() == null) { 393 id.setRegion(resolvedId.getRegion()); 394 } 395 if (id.getVariant() == null) { 396 id.setVariant(resolvedId.getVariant()); 397 } 398 }); 399 return id.toString(); 400 } 401 402 /** 403 * Returns a suitable default calendar for a given locale if it's different from the default 404 * calendar inferred by the locale's parent. 405 * 406 * <p>Note that since the default calendar data is keyed from territory (region subtag) rather 407 * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact 408 * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently 409 * handled with hard-code special casing, but should probably be data driven eventually. 410 */ getDefaultCalendar(String localeId)411 public Optional<String> getDefaultCalendar(String localeId) { 412 Optional<String> calendar = getSpecialCaseCalendar(localeId); 413 if (calendar.isPresent()) { 414 return calendar; 415 } 416 String t = territoryOf(localeId); 417 calendar = Optional.ofNullable(defaultCalendarMap.get(t)); 418 if (!calendar.isPresent()) { 419 return Optional.empty(); 420 } 421 String rootCalendar = defaultCalendarMap.get("001"); 422 checkState(!rootCalendar.isEmpty(), "missing root calendar"); 423 if (localeId.equals("root")) { 424 return Optional.of(rootCalendar); 425 } 426 // All locales reach "root" eventually, and that maps to territory "001" which 427 // we already know has a value, so this loop *must* exit. 428 String parentCalendar; 429 do { 430 localeId = getParent(localeId); 431 String territory = territoryOf(localeId); 432 parentCalendar = defaultCalendarMap.get(territory); 433 } while (parentCalendar == null); 434 return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar; 435 } 436 437 // Hack to work around the limitation that CLDR data cannot represent default calendars that 438 // change because of non-territory information. Since this is limited to exactly two cases at 439 // the moment, and is unlikely to be expanded, it's being done directly in code. getSpecialCaseCalendar(String localeId)440 private Optional<String> getSpecialCaseCalendar(String localeId) { 441 Optional<String> maximized = maximize(localeId); 442 if (maximized.isPresent()) { 443 switch (maximized.get()) { 444 case "ja_Jpan_JP_TRADITIONAL": 445 return Optional.of("japanese"); 446 case "th_Thai_TH_TRADITIONAL": 447 return Optional.of("buddhist"); 448 } 449 } 450 return Optional.empty(); 451 } 452 453 /** 454 * Returns the parent of a non-root locale ID. This is more complex than simple truncation for 455 * two reasons: 456 * <ul> 457 * <li>There may be an explicit parent locale ID specified in the CLDR data. 458 * <li>Removal of non-default script subtags makes the parent locale "root" (unless there 459 * was an explicit parent specified). 460 * </ul> 461 * Note that all valid locale ID parent "chains" must end up at "root" eventually. 462 * 463 * For example (showing parent "chains"): 464 * <ul> 465 * <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root} 466 * <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root} 467 * </ul> 468 * 469 * @throws IllegalArgumentException if the given locale ID is invalid or "root". 470 */ getParent(String localeId)471 public String getParent(String localeId) { 472 checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale"); 473 // We probably want to fully canonicalize here. But in the absence of that we 474 // at least need to do the following canonicalization: 475 if (localeId.equals("no_NO_NY")) { 476 localeId = "nn_NO"; 477 } 478 // Always defer to an explicit parent locale set in the CLDR data. 479 Optional<String> explicitParent = getExplicitParentLocaleOf(localeId); 480 if (explicitParent.isPresent()) { 481 return explicitParent.get(); 482 } 483 // Now look for the start of the last ID "part" in order to truncate. 484 int lastPartSeperatorIndex = localeId.lastIndexOf('_'); 485 // The parent of a base language ID (e.g. "en" or "fr") is always "root". 486 if (lastPartSeperatorIndex == -1) { 487 return "root"; 488 } 489 String parentId = localeId.substring(0, lastPartSeperatorIndex); 490 491 // However, if the script of the locale is what's being truncated and it's NOT the default 492 // script for the language, return "root" as the parent rather than truncating. 493 String lastPart = localeId.substring(lastPartSeperatorIndex + 1); 494 if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) { 495 return "root"; 496 } 497 return !parentId.isEmpty() ? parentId : "root"; 498 } 499 500 /** 501 * Returns the explicit parent of a locale ID if specified in the CLDR data. 502 * 503 * Note that this method will not return a value for most locale IDs, since they do not have 504 * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link 505 * #getParent(String)}. 506 */ getExplicitParentLocaleOf(String localeId)507 public Optional<String> getExplicitParentLocaleOf(String localeId) { 508 return Optional.ofNullable(parentLocaleMap.get(localeId)); 509 } 510 territoryOf(String localeId)511 private String territoryOf(String localeId) { 512 return localeId.equals("root") 513 ? "001" 514 : addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ"); 515 } 516 scriptOf(String localeId)517 private String scriptOf(String localeId) { 518 return addLikelySubtags(localeId).map(LocaleId::getScript).orElse("Zzzz"); 519 } 520 521 // From: https://unicode.org/reports/tr35/#Likely_Subtags 522 // 523 // Add Likely Subtags 524 // ------------------ 525 // Given a source locale X, to return a locale Y where the empty subtags have been filled in 526 // by the most likely subtags. A subtag is called empty if it is a missing script or region 527 // subtag, or it is a base language subtag with the value "und". 528 // 529 // Canonicalize 530 // ------------ 531 // Make sure the input locale is in canonical form ... 532 // ... 533 // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur. 534 // 535 // Note that this implementation does not need to handle 536 // legacy language tags (marked as “Type: grandfathered” in BCP 47). addLikelySubtags(String localeId)537 private Optional<LocaleId> addLikelySubtags(String localeId) { 538 if (localeId.equals("root")) { 539 return Optional.empty(); 540 } 541 542 LocaleId id = LocaleId.parse(localeId); 543 // ---- LDML Specification ---- 544 // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur. 545 if ("Zzzz".equals(id.getScript())) { 546 id.setScript(null); 547 } 548 if ("ZZ".equals(id.getRegion())) { 549 id.setRegion(null); 550 } 551 // ---- LDML Specification ---- 552 // A subtag is called empty if it is a missing script or region subtag, or it is a base 553 // language subtag with the value "und" 554 if (!id.getLanguage().equals("und") && id.getScript() != null && id.getRegion() != null) { 555 // We are already canonical, so just return. 556 return Optional.of(id); 557 } 558 Optional<LocaleId> optTags = resolveLocaleId(id, likelySubtagMap::get); 559 if (!optTags.isPresent()) { 560 return Optional.empty(); 561 } 562 LocaleId subtags = optTags.get(); 563 checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags); 564 // Replace "missing" elements in the original ID with likely subtags. 565 if (id.getLanguage().equals("und")) { 566 id.setLanguage(subtags.getLanguage()); 567 } 568 if (id.getScript() == null) { 569 id.setScript(checkNotNull(subtags.getScript())); 570 } 571 if (id.getRegion() == null) { 572 id.setRegion(checkNotNull(subtags.getRegion())); 573 } 574 // Language is not "und" and both script and region subtags are set! 575 return Optional.of(id); 576 } 577 578 // From: https://unicode.org/reports/tr35/#Likely_Subtags 579 // 580 // Lookup 581 // ------ 582 // Lookup each of the following in order, and stop on the first match: 583 // <language>_<script>_<region> 584 // <language>_<region> 585 // <language>_<script> 586 // <language> 587 // "und"_<script> resolveLocaleId(LocaleId id, Function<String, String> fn)588 private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) { 589 String lang = id.getLanguage(); 590 String script = id.getScript(); 591 String region = id.getRegion(); 592 Stream<LocaleId> candidateIds = Stream.of( 593 LocaleId.of(lang, script, region), 594 LocaleId.of(lang, null, region), 595 LocaleId.of(lang, script, null), 596 LocaleId.of(lang, null, null)); 597 // Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on 598 // its own ("en_Latn_US") which is not intended. 599 if (script != null) { 600 candidateIds = Stream.concat(candidateIds, Stream.of(LocaleId.of("und", script, null))); 601 } 602 return candidateIds 603 // Remove duplicate IDs (keeps the first one encountered). 604 .distinct() 605 .map(Object::toString) 606 .map(fn) 607 .filter(Objects::nonNull) 608 .findFirst() 609 .map(LocaleId::parse); 610 } 611 } 612