• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.Preconditions.checkArgument;
4 
5 import java.util.regex.Pattern;
6 
7 /**
8  * Utility methods for working with locale IDs as strings. This could, with a little thought, be
9  * made public if necessary.
10  */
11 final class LocaleIds {
12     // From: https://unicode.org/reports/tr35/#Identifiers
13     // Locale ID is:
14     //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
15     //
16     // However in CLDR data, there's always a language (even if it's "und"), and never more
17     // than one variant, so this can be simplified to:
18     //   <language>(_<script>)?(_<region>)?(_<variant>)?
19     //
20     // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
21     //   Note that the specification allows for languages 5-8 characters long, but in reality
22     //   this has never occurred yet, so it's ignored in this code.
23     //
24     // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
25     //   The specification permits any casing for script subtags, but since all the data uses
26     //   the capitalized "Xxxx" form, that's what this code expects.
27     //
28     // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
29     //   identifier (e.g. "001").
30     //
31     // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
32     //   with a digit (this avoids any ambiguity with script subtags). However because ICU
33     //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
34     //   merely "longer than 5".
35     //
36     // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows
37     // for either '-' or '_').
38     //
39     // The regex for unambiguously matching a valid locale ID (other than "root") for CLDR data is:
40     private static final Pattern LOCALE_ID =
41         Pattern.compile("(?:[a-z]{2,3})"
42             + "(?:_(?:[A-Z][a-z]{3}))?"
43             + "(?:_(?:[A-Z]{2}|[0-9]{3}))?"
44             + "(?:_(?:[A-Z]{5,}|[0-9][A-Z0-9]{3}))?");
45 
46     /**
47      * Checks whether the given ID is valid for CLDR use (including case). Locale IDs for use in
48      * CLDR APIs are a subset of all possible locale IDs and, unlike general locale IDs, they
49      * are case sensitive. The rules are:
50      * <ul>
51      *     <li>A locale ID is up to four subtags {@code
52      *         <language>(_<script>)?(_<region>)?(_<variant>)?}
53      *     <li>The allowed subtag separator is only ASCII underscore (not hyphen).
54      *     <li>The language subtag must exist (though it is permitted to be {@code "und"}).
55      *     <li>All other subtags are optional and are separated by a single underscore.
56      *     <li>Language subtag is lower-case, and is either 2 or 3 letters (i.e. "[a-z]{2,3}").
57      *     <li>Script subtag is mixed-case and must match {@code "[A-Z][a-z]{3}"}.
58      *     <li>Region subtag is upper-case and must match {@code "[A-Z]{2}} or {@code "[0-9]{3}"}.
59      *     <li>Variant subtag is upper-case and must match {@code "[A-Z]{5,}} or
60      *         {@code "[0-9][A-Z0-9]{3}"}.
61      *     <li>The special locale ID {@code "root"} is also permitted.
62      * <ul>
63      *
64      * <p>Note that this check does don't enforce validity in terms of checking for deprecated
65      * languages, regions or script, so things like {@code "sh_YU"} (deprecated language and/or
66      * region) are accepted.
67      *
68      * <p>Examples of valid locale IDs are {@code "en"}, {@code "zh_Hant"}, {@code "fr_CA"},
69      * {@code "sr_Latn_RS"} and {@code "ja_JP_TRADITIONAL"}.
70      *
71      * <p>Examples of invalid locale IDs are {@code ""}, {@code "en_"}, {@code "Latn"} and
72      * {@code "de__TRADITIONAL"}.
73      *
74      * @param localeId the ID the check.
75      * @throws IllegalArgumentException is the ID is invalid.
76      */
checkCldrLocaleId(String localeId)77     public static void checkCldrLocaleId(String localeId) {
78         // This check runs on a lot of locales, so make it as minimal as possible. If normalization
79         // is ever needed, do it in a separate method.
80         checkArgument(LOCALE_ID.matcher(localeId).matches() || localeId.equals("root"),
81             "bad locale ID: %s", localeId);
82     }
83 
84 }
85