• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu;
4 
5 import static com.google.common.base.CharMatcher.whitespace;
6 import static com.google.common.base.Preconditions.checkArgument;
7 import static com.google.common.base.Preconditions.checkNotNull;
8 import static com.google.common.base.Preconditions.checkState;
9 import static com.google.common.collect.ImmutableMap.toImmutableMap;
10 import static java.util.function.Function.identity;
11 import static org.unicode.cldr.api.AttributeKey.keyOf;
12 import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
13 
14 import java.util.Arrays;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.Objects;
19 import java.util.Optional;
20 import java.util.Set;
21 import java.util.function.Function;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24 import java.util.stream.Stream;
25 
26 import org.unicode.cldr.api.AttributeKey;
27 import org.unicode.cldr.api.CldrDataSupplier;
28 import org.unicode.cldr.api.CldrDataType;
29 import org.unicode.cldr.api.PathMatcher;
30 
31 import com.google.common.base.Ascii;
32 import com.google.common.base.Splitter;
33 import com.google.common.base.Strings;
34 import com.google.common.collect.HashBasedTable;
35 import com.google.common.collect.ImmutableMap;
36 import com.google.common.collect.ImmutableSet;
37 import com.google.common.collect.ImmutableTable;
38 import com.google.common.collect.Sets;
39 import com.google.common.collect.Table;
40 
41 /**
42  * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
43  * in some mapper classes.
44  *
45  * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
46  * built using the same underlying CLDR data. The only reason mapper classes do not create their
47  * own instances directly is the relative cost of processing all the supplemental data each time.
48  */
49 // TODO: This should be moved into the API and leverage some of the existing utility functions.
50 public final class SupplementalData {
51     // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
52     // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
53     // locale. However CLDR cannot represent this currently because calendar defaults are in
54     // supplemental data (rather than locale data) and are keyed only on territory.
55     private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
56         ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
57 
58     private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
59 
60     private static final PathMatcher ALIAS =
61         PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]");
62 
63     private static final PathMatcher PARENT_LOCALE =
64         PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]");
65     private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
66     private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
67 
68     private static final PathMatcher CALENDER_PREFERENCE =
69         PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
70     private static final AttributeKey CALENDER_TERRITORIES =
71         keyOf("calendarPreference", "territories");
72     private static final AttributeKey CALENDER_ORDERING =
73         keyOf("calendarPreference", "ordering");
74 
75     private static final PathMatcher LIKELY_SUBTAGS =
76         PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]");
77     private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
78     private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
79 
80     private static final Splitter LIST_SPLITTER =
81         Splitter.on(whitespace()).omitEmptyStrings();
82 
83     // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
84     // a single value (it's structurally always a list, but only territory aliases have a need for
85     // more than one value).
86     private enum Alias {
87         LANGUAGE, SCRIPT, TERRITORY;
88 
89         private static final ImmutableMap<String, Alias> TYPE_MAP =
90             Arrays.stream(values())
91                 .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity()));
92 
93         private final String elementName = Ascii.toLowerCase(name()) + "Alias";
94         final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type");
95         final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement");
96 
forElementName(String name)97         static Optional<Alias> forElementName(String name) {
98             return Optional.ofNullable(TYPE_MAP.get(name));
99         }
100     }
101 
102     /**
103      * Creates a supplemental data API instance from the given CLDR data supplier.
104      *
105      * @param src the CLDR data supplier.
106      * @return the supplemental data API.
107      */
create(CldrDataSupplier src)108     public static SupplementalData create(CldrDataSupplier src) {
109         Table<Alias, String, String> aliasTable = HashBasedTable.create();
110         Map<String, String> parentLocaleMap = new HashMap<>();
111         Map<String, String> defaultCalendarMap = new HashMap<>();
112         Map<String, String> likelySubtagMap = new HashMap<>();
113 
114         src.getDataForType(CldrDataType.SUPPLEMENTAL).accept(
115             ARBITRARY,
116             v -> {
117                 if (ALIAS.matches(v.getPath())) {
118                     // Territory alias replacements can be a list of values (e.g. when countries
119                     // break up). We use the first (geo-politically most significant) value. This
120                     // doesn't happen for languages or scripts, but could in theory.
121                     Alias.forElementName(v.getPath().getName()).ifPresent(
122                         alias -> aliasTable.put(
123                             alias,
124                             alias.typeKey.valueFrom(v),
125                             alias.replacementKey.valueFrom(v)));
126                 } else if (PARENT_LOCALE.matches(v.getPath())) {
127                     String p = PARENT.valueFrom(v);
128                     LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p));
129                 } else if (CALENDER_PREFERENCE.matches(v.getPath())) {
130                     String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0);
131                     CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c));
132                 } else if (LIKELY_SUBTAGS.matches(v.getPath())) {
133                     likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v));
134                 }
135             });
136 
137         Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS);
138         return new SupplementalData(
139             availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
140     }
141 
142     // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
143     // data generation. Because this is mutable, it is thoroughly unsuitable for general use.
144     private static final class LocaleId {
145         // From: https://unicode.org/reports/tr35/#Identifiers
146         // Locale ID is:
147         //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
148         //
149         // However in CLDR data, there's always a language (even if it's "und"), and never more
150         // than one variant, so this can be simplified to:
151         //   <language>(_<script>)?(_<region>)?(_<variant>)?
152         //
153         // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
154         //   Note that the specification allows for languages 5-8 characters long, but in reality
155         //   this has never occurred yet, so it's ignored in this code.
156         //
157         // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
158         //   The specification permits any casing for script subtags, but since all the data uses
159         //   the capitalized "Xxxx" form, that's what this code expects.
160         //
161         // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
162         //   identifier (e.g. "001").
163         //
164         // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
165         //   with a digit (this avoids any ambiguity with script subtags). However because ICU
166         //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
167         //   merely "longer than 5".
168         //
169         // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows
170         // for either '-' or '_').
171         //
172         // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
173         private static final Pattern LOCALE_ID =
174             Pattern.compile("([a-z]{2,3})"
175                 + "(?:_([A-Z][a-z]{3}))?"
176                 + "(?:_([A-Z]{2}|[0-9]{3}))?"
177                 + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");
178 
parse(String localeId)179         static LocaleId parse(String localeId) {
180             Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null"));
181             checkArgument(m.matches(), "invalid locale ID: %s", localeId);
182             return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4));
183         }
184 
of(String language, String script, String region)185         static LocaleId of(String language, String script, String region) {
186             return new LocaleId().setLanguage(language).setScript(script).setRegion(region);
187         }
188 
189         // Only the language subtag is non-nullable.
190         private String languageSubtag;
191         private String scriptSubtag;
192         private String regionSubtag;
193         private String variantSubtag;
194 
getLanguage()195         String getLanguage() {
196             return languageSubtag;
197         }
198 
getScript()199         String getScript() {
200             return scriptSubtag;
201         }
202 
getRegion()203         String getRegion() {
204             return regionSubtag;
205         }
206 
getVariant()207         String getVariant() {
208             return variantSubtag;
209         }
210 
setLanguage(String languageSubtag)211         LocaleId setLanguage(String languageSubtag) {
212             checkNotNull(languageSubtag, "language subtag must not be null");
213             checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
214             this.languageSubtag = languageSubtag;
215             return this;
216         }
217 
setScript(String scriptSubtag)218         LocaleId setScript(String scriptSubtag) {
219             this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
220             return this;
221         }
222 
setRegion(String regionSubtag)223         LocaleId setRegion(String regionSubtag) {
224             this.regionSubtag = Strings.emptyToNull(regionSubtag);
225             return this;
226         }
227 
setVariant(String variantSubtag)228         LocaleId setVariant(String variantSubtag) {
229             this.variantSubtag = Strings.emptyToNull(variantSubtag);
230             return this;
231         }
232 
toString()233         @Override public String toString() {
234             StringBuilder id = new StringBuilder(languageSubtag);
235             if (scriptSubtag != null) {
236                 id.append("_").append(scriptSubtag);
237             }
238             if (regionSubtag != null) {
239                 id.append("_").append(regionSubtag);
240             }
241             if (variantSubtag != null) {
242                 id.append("_").append(variantSubtag);
243             }
244             return id.toString();
245         }
246 
equals(Object o)247         @Override public boolean equals(Object o) {
248             if (!(o instanceof LocaleId)) {
249                 return false;
250             }
251             LocaleId other = (LocaleId) o;
252             return Objects.equals(languageSubtag, other.languageSubtag)
253                 && Objects.equals(scriptSubtag, other.scriptSubtag)
254                 && Objects.equals(regionSubtag, other.regionSubtag)
255                 && Objects.equals(variantSubtag, other.variantSubtag);
256         }
257 
hashCode()258         @Override public int hashCode() {
259             return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
260         }
261     }
262 
263     private final ImmutableSet<String> availableIds;
264     private final ImmutableTable<Alias, String, String> aliasTable;
265     private final ImmutableMap<String, String> parentLocaleMap;
266     private final ImmutableMap<String, String> defaultCalendarMap;
267     private final ImmutableMap<String, String> likelySubtagMap;
268 
SupplementalData( Set<String> availableIds, Table<Alias, String, String> aliasTable, Map<String, String> parentLocaleMap, Map<String, String> defaultCalendarMap, Map<String, String> likelySubtagMap)269     private SupplementalData(
270         Set<String> availableIds,
271         Table<Alias, String, String> aliasTable,
272         Map<String, String> parentLocaleMap,
273         Map<String, String> defaultCalendarMap,
274         Map<String, String> likelySubtagMap) {
275 
276         this.availableIds = ImmutableSet.copyOf(availableIds);
277         this.aliasTable = ImmutableTable.copyOf(aliasTable);
278         this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
279         this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
280         this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
281     }
282 
getAvailableLocaleIds()283     public ImmutableSet<String> getAvailableLocaleIds() {
284         return availableIds;
285     }
286 
287     /**
288      * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
289      */
maximize(String localeId)290     public Optional<String> maximize(String localeId) {
291         return addLikelySubtags(localeId).map(Object::toString);
292     }
293 
294     /**
295      * Returns the locale ID with any deprecated elements replaced. This is an
296      * implementation of the algorithm specified in
297      * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
298      * specification</a> but without any "minimizing" of the final result (as happens for
299      * canonicalization in the CLDR tools).
300      */
replaceDeprecatedTags(String localeId)301     public String replaceDeprecatedTags(String localeId) {
302         if (localeId.equals("root")) {
303             return localeId;
304         }
305         LocaleId id = LocaleId.parse(localeId);
306 
307         // ---- LDML Specification ----
308         // If the region subtag matches the type attribute of a territoryAlias element in
309         // Supplemental Data, replace the region subtag with the replacement value, as follows:
310         //
311         // * If there is a single territory in the replacement, use it.
312         // * If there are multiple territories:
313         //   * Look up the most likely territory for the base language code (and script, if there
314         //     is one).
315         //   * If that likely territory is in the list, use it.
316         //   * Otherwise, use the first territory in the list.
317         // ----
318         // However there is a footnote that says:
319         //   Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
320         //   However, there are a small number of cases of multiple territories, so the mappings
321         //   can be precomputed. This results in a faster lookup with a very small subset of the
322         //   likely subtags data.
323         //
324         // Note that (contrary to the order implied by the LDML specification) this step is
325         // performed _before_ the language alias lookup. This is to allow ID such as "sr_YU" to
326         // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
327         // "sr_Cryl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
328         //
329         // TODO: Can we simplify this my just using "addLikelySubtags()" when region is missing?
330         if (id.getRegion() != null) {
331             String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
332             if (replacementRegions != null) {
333                 List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
334                 checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
335                 if (regions.size() == 1) {
336                     id.setRegion(regions.get(0));
337                 } else {
338                     LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
339                     String likelyId = likelySubtagMap.get(key.toString());
340                     if (likelyId == null) {
341                         likelyId = likelySubtagMap.get(key.setScript(null).toString());
342                     }
343                     String likelyRegion =
344                         likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
345                     if (regions.contains(likelyRegion)) {
346                         id.setRegion(likelyRegion);
347                     } else {
348                         id.setRegion(regions.get(0));
349                     }
350                 }
351             }
352         }
353 
354         // While it's not mentioned in the LDML specification, there is data in the alias table for
355         // replacement scripts (currently it contains exactly one entry with one value). Because
356         // its not clear if this is intended to only be single values or a list (and how to handle
357         // it if it were a list), there's a hard check to ensure it's only ever a single value.
358         if (id.getScript() != null) {
359             String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
360             if (replacementScript != null) {
361                 checkArgument(whitespace().matchesNoneOf(replacementScript),
362                     "unexpected list of replacement scripts: %s", replacementScript);
363                 id.setScript(replacementScript);
364             }
365         }
366 
367         // ---- LDML Specification ----
368         // If the language subtag matches the type attribute of a languageAlias element in
369         // Supplemental Data, replace the language subtag with the replacement value.
370         //
371         // If there are additional subtags in the replacement value, add them to the result, but
372         // only if there is no corresponding subtag already in the tag.
373         // ----
374         // Contrary to the precise wording of the specification, we don't just check the language
375         // subtag, since language aliases can contain script and even region information. Instead
376         // we check the alias table using the same order as defined in subtag maximizing:
377         //
378         // <language>_<script>_<region>
379         // <language>_<region>
380         // <language>_<script>
381         // <language>
382         //
383         // There is no need to check for "und" however since that's not aliased anything, but since
384         // it shares the same code it's harmless to do.
385         resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
386             .ifPresent(resolvedId -> {
387                 id.setLanguage(checkNotNull(resolvedId.getLanguage(),
388                      "missing language subtag in language alias: %s", resolvedId));
389                 if (id.getScript() == null) {
390                     id.setScript(resolvedId.getScript());
391                 }
392                 if (id.getRegion() == null) {
393                     id.setRegion(resolvedId.getRegion());
394                 }
395                 if (id.getVariant() == null) {
396                     id.setVariant(resolvedId.getVariant());
397                 }
398             });
399         return id.toString();
400     }
401 
402     /**
403      * Returns a suitable default calendar for a given locale if it's different from the default
404      * calendar inferred by the locale's parent.
405      *
406      * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
407      * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
408      * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
409      * handled with hard-code special casing, but should probably be data driven eventually.
410      */
getDefaultCalendar(String localeId)411     public Optional<String> getDefaultCalendar(String localeId) {
412         Optional<String> calendar = getSpecialCaseCalendar(localeId);
413         if (calendar.isPresent()) {
414             return calendar;
415         }
416         String t = territoryOf(localeId);
417         calendar = Optional.ofNullable(defaultCalendarMap.get(t));
418         if (!calendar.isPresent()) {
419             return Optional.empty();
420         }
421         String rootCalendar = defaultCalendarMap.get("001");
422         checkState(!rootCalendar.isEmpty(), "missing root calendar");
423         if (localeId.equals("root")) {
424             return Optional.of(rootCalendar);
425         }
426         // All locales reach "root" eventually, and that maps to territory "001" which
427         // we already know has a value, so this loop *must* exit.
428         String parentCalendar;
429         do {
430             localeId = getParent(localeId);
431             String territory = territoryOf(localeId);
432             parentCalendar = defaultCalendarMap.get(territory);
433         } while (parentCalendar == null);
434         return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar;
435     }
436 
437     // Hack to work around the limitation that CLDR data cannot represent default calendars that
438     // change because of non-territory information. Since this is limited to exactly two cases at
439     // the moment, and is unlikely to be expanded, it's being done directly in code.
getSpecialCaseCalendar(String localeId)440     private Optional<String> getSpecialCaseCalendar(String localeId) {
441         Optional<String> maximized = maximize(localeId);
442         if (maximized.isPresent()) {
443             switch (maximized.get()) {
444             case "ja_Jpan_JP_TRADITIONAL":
445                 return Optional.of("japanese");
446             case "th_Thai_TH_TRADITIONAL":
447                 return Optional.of("buddhist");
448             }
449         }
450         return Optional.empty();
451     }
452 
453     /**
454      * Returns the parent of a non-root locale ID. This is more complex than simple truncation for
455      * two reasons:
456      * <ul>
457      *     <li>There may be an explicit parent locale ID specified in the CLDR data.
458      *     <li>Removal of non-default script subtags makes the parent locale "root" (unless there
459      *         was an explicit parent specified).
460      * </ul>
461      * Note that all valid locale ID parent "chains" must end up at "root" eventually.
462      *
463      * For example (showing parent "chains"):
464      * <ul>
465      *     <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
466      *     <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
467      * </ul>
468      *
469      * @throws IllegalArgumentException if the given locale ID is invalid or "root".
470      */
getParent(String localeId)471     public String getParent(String localeId) {
472         checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
473         // We probably want to fully canonicalize here. But in the absence of that we
474         // at least need to do the following canonicalization:
475         if (localeId.equals("no_NO_NY")) {
476             localeId = "nn_NO";
477         }
478         // Always defer to an explicit parent locale set in the CLDR data.
479         Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
480         if (explicitParent.isPresent()) {
481             return explicitParent.get();
482         }
483         // Now look for the start of the last ID "part" in order to truncate.
484         int lastPartSeperatorIndex = localeId.lastIndexOf('_');
485         // The parent of a base language ID (e.g. "en" or "fr") is always "root".
486         if (lastPartSeperatorIndex == -1) {
487             return "root";
488         }
489         String parentId = localeId.substring(0, lastPartSeperatorIndex);
490 
491         // However, if the script of the locale is what's being truncated and it's NOT the default
492         // script for the language, return "root" as the parent rather than truncating.
493         String lastPart = localeId.substring(lastPartSeperatorIndex + 1);
494         if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) {
495             return "root";
496         }
497         return !parentId.isEmpty() ? parentId : "root";
498     }
499 
500     /**
501      * Returns the explicit parent of a locale ID if specified in the CLDR data.
502      *
503      * Note that this method will not return a value for most locale IDs, since they do not have
504      * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
505      * #getParent(String)}.
506      */
getExplicitParentLocaleOf(String localeId)507     public Optional<String> getExplicitParentLocaleOf(String localeId) {
508         return Optional.ofNullable(parentLocaleMap.get(localeId));
509     }
510 
territoryOf(String localeId)511     private String territoryOf(String localeId) {
512         return localeId.equals("root")
513             ? "001"
514             : addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ");
515     }
516 
scriptOf(String localeId)517     private String scriptOf(String localeId) {
518         return addLikelySubtags(localeId).map(LocaleId::getScript).orElse("Zzzz");
519     }
520 
521     // From: https://unicode.org/reports/tr35/#Likely_Subtags
522     //
523     // Add Likely Subtags
524     // ------------------
525     // Given a source locale X, to return a locale Y where the empty subtags have been filled in
526     // by the most likely subtags. A subtag is called empty if it is a missing script or region
527     // subtag, or it is a base language subtag with the value "und".
528     //
529     // Canonicalize
530     // ------------
531     // Make sure the input locale is in canonical form ...
532     // ...
533     // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
534     //
535     // Note that this implementation does not need to handle
536     // legacy language tags (marked as “Type: grandfathered” in BCP 47).
addLikelySubtags(String localeId)537     private Optional<LocaleId> addLikelySubtags(String localeId) {
538         if (localeId.equals("root")) {
539             return Optional.empty();
540         }
541 
542         LocaleId id = LocaleId.parse(localeId);
543         // ---- LDML Specification ----
544         // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
545         if ("Zzzz".equals(id.getScript())) {
546             id.setScript(null);
547         }
548         if ("ZZ".equals(id.getRegion())) {
549             id.setRegion(null);
550         }
551         // ---- LDML Specification ----
552         // A subtag is called empty if it is a missing script or region subtag, or it is a base
553         // language subtag with the value "und"
554         if (!id.getLanguage().equals("und") && id.getScript() != null && id.getRegion() != null) {
555             // We are already canonical, so just return.
556             return Optional.of(id);
557         }
558         Optional<LocaleId> optTags = resolveLocaleId(id, likelySubtagMap::get);
559         if (!optTags.isPresent()) {
560             return Optional.empty();
561         }
562         LocaleId subtags = optTags.get();
563         checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags);
564         // Replace "missing" elements in the original ID with likely subtags.
565         if (id.getLanguage().equals("und")) {
566             id.setLanguage(subtags.getLanguage());
567         }
568         if (id.getScript() == null) {
569             id.setScript(checkNotNull(subtags.getScript()));
570         }
571         if (id.getRegion() == null) {
572             id.setRegion(checkNotNull(subtags.getRegion()));
573         }
574         // Language is not "und" and both script and region subtags are set!
575         return Optional.of(id);
576     }
577 
578     // From: https://unicode.org/reports/tr35/#Likely_Subtags
579     //
580     // Lookup
581     // ------
582     // Lookup each of the following in order, and stop on the first match:
583     // <language>_<script>_<region>
584     // <language>_<region>
585     // <language>_<script>
586     // <language>
587     // "und"_<script>
resolveLocaleId(LocaleId id, Function<String, String> fn)588     private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
589         String lang = id.getLanguage();
590         String script = id.getScript();
591         String region = id.getRegion();
592         Stream<LocaleId> candidateIds = Stream.of(
593             LocaleId.of(lang, script, region),
594             LocaleId.of(lang, null, region),
595             LocaleId.of(lang, script, null),
596             LocaleId.of(lang, null, null));
597         // Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on
598         // its own ("en_Latn_US") which is not intended.
599         if (script != null) {
600             candidateIds = Stream.concat(candidateIds, Stream.of(LocaleId.of("und", script, null)));
601         }
602         return candidateIds
603             // Remove duplicate IDs (keeps the first one encountered).
604             .distinct()
605             .map(Object::toString)
606             .map(fn)
607             .filter(Objects::nonNull)
608             .findFirst()
609             .map(LocaleId::parse);
610     }
611 }
612