• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.Preconditions.checkArgument;
4 
5 import java.util.regex.Pattern;
6 
7 /**
8  * Utility methods for working with locale IDs as strings. This could, with a little thought, be
9  * made public if necessary.
10  */
11 final class LocaleIds {
12     // From: https://unicode.org/reports/tr35/#Identifiers
13     // Locale ID is:
14     //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
15     //
16     // However in CLDR data, there's always a language (even if it's "und"), and never more
17     // than one variant, so this can be simplified to:
18     //   <language>(_<script>)?(_<region>)?(_<variant>)?
19     //
20     // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
21     //   Note that the specification allows for languages 5-8 characters long, but in reality
22     //   this has never occurred yet, so it's ignored in this code.
23     //
24     // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
25     //   The specification permits any casing for script subtags, but since all the data uses
26     //   the capitalized "Xxxx" form, that's what this code expects.
27     //
28     // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
29     //   identifier (e.g. "001").
30     //
31     // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
32     //   with a digit (this avoids any ambiguity with script subtags). However because ICU
33     //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
34     //   merely "longer than 5".
35     //
36     // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows
37     // for either '-' or '_').
38     //
39     // The regex for unambiguously matching a valid locale ID (other than "root") for CLDR data is:
40     private static final Pattern LOCALE_ID =
41             Pattern.compile(
42                     "(?:[a-z]{2,3})"
43                             + "(?:_(?:[A-Z][a-z]{3}))?"
44                             + "(?:_(?:[A-Z]{2}|[0-9]{3}))?"
45                             + "(?:_(?:[A-Za-z]{5,}|[0-9][A-Za-z0-9]{3}))?");
46 
47     /**
48      * Checks whether the given ID is valid for CLDR use (including case). Locale IDs for use in
49      * CLDR APIs are a subset of all possible locale IDs and, unlike general locale IDs, they are
50      * case sensitive. The rules are:
51      *
52      * <ul>
53      *   <li>A locale ID is up to four subtags {@code
54      *       <language>(_<script>)?(_<region>)?(_<variant>)?}
55      *   <li>The allowed subtag separator is only ASCII underscore (not hyphen).
56      *   <li>The language subtag must exist (though it is permitted to be {@code "und"}).
57      *   <li>All other subtags are optional and are separated by a single underscore.
58      *   <li>Language subtag is lower-case, and is either 2 or 3 letters (i.e. "[a-z]{2,3}").
59      *   <li>Script subtag is mixed-case and must match {@code "[A-Z][a-z]{3}"}.
60      *   <li>Region subtag is upper-case and must match {@code "[A-Z]{2}} or {@code "[0-9]{3}"}.
61      *   <li>Variant subtag is upper- or lower-case and must match {@code "[A-Z]{5,}} or {@code
62      *       "[0-9][A-Z0-9]{3}"}. Note: The EBNF at
63      *       https://www.unicode.org/reports/tr35/#unicode_variant_subtag allows either lettercase,
64      *       and the data at common/validity/variant.xml user lower. CLDR 40 has be_tarask,
65      *       ca_ES_VALENCIA, en_US_POSIX.
66      *   <li>The special locale ID {@code "root"} is also permitted.
67      *       <ul>
68      *         <p>Note that this check does don't enforce validity in terms of checking for
69      *         deprecated languages, regions or script, so things like {@code "sh_YU"} (deprecated
70      *         language and/or region) are accepted.
71      *         <p>Examples of valid locale IDs are {@code "en"}, {@code "zh_Hant"}, {@code "fr_CA"},
72      *         {@code "sr_Latn_RS"} and {@code "ja_JP_TRADITIONAL"}.
73      *         <p>Examples of invalid locale IDs are {@code ""}, {@code "en_"}, {@code "Latn"} and
74      *         {@code "de__TRADITIONAL"}.
75      *
76      * @param localeId the ID the check.
77      * @throws IllegalArgumentException is the ID is invalid.
78      */
checkCldrLocaleId(String localeId)79     public static void checkCldrLocaleId(String localeId) {
80         // This check runs on a lot of locales, so make it as minimal as possible. If normalization
81         // is ever needed, do it in a separate method.
82         checkArgument(
83                 LOCALE_ID.matcher(localeId).matches() || localeId.equals("root"),
84                 "bad locale ID: %s",
85                 localeId);
86     }
87 }
88