1 package org.unicode.cldr.api; 2 3 import static com.google.common.base.Preconditions.checkArgument; 4 5 import java.util.regex.Pattern; 6 7 /** 8 * Utility methods for working with locale IDs as strings. This could, with a little thought, be 9 * made public if necessary. 10 */ 11 final class LocaleIds { 12 // From: https://unicode.org/reports/tr35/#Identifiers 13 // Locale ID is: 14 // (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)* 15 // 16 // However in CLDR data, there's always a language (even if it's "und"), and never more 17 // than one variant, so this can be simplified to: 18 // <language>(_<script>)?(_<region>)?(_<variant>)? 19 // 20 // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw"). 21 // Note that the specification allows for languages 5-8 characters long, but in reality 22 // this has never occurred yet, so it's ignored in this code. 23 // 24 // * Script is 4-letter Xxxx script identifier (e.g. "Latn"). 25 // The specification permits any casing for script subtags, but since all the data uses 26 // the capitalized "Xxxx" form, that's what this code expects. 27 // 28 // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric 29 // identifier (e.g. "001"). 30 // 31 // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting 32 // with a digit (this avoids any ambiguity with script subtags). However because ICU 33 // violates this rule by using "TRADITIONAL" (11-letters) the length restriction is 34 // merely "longer than 5". 35 // 36 // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows 37 // for either '-' or '_'). 38 // 39 // The regex for unambiguously matching a valid locale ID (other than "root") for CLDR data is: 40 private static final Pattern LOCALE_ID = 41 Pattern.compile( 42 "(?:[a-z]{2,3})" 43 + "(?:_(?:[A-Z][a-z]{3}))?" 44 + "(?:_(?:[A-Z]{2}|[0-9]{3}))?" 45 + "(?:_(?:[A-Za-z]{5,}|[0-9][A-Za-z0-9]{3}))?"); 46 47 /** 48 * Checks whether the given ID is valid for CLDR use (including case). Locale IDs for use in 49 * CLDR APIs are a subset of all possible locale IDs and, unlike general locale IDs, they are 50 * case sensitive. The rules are: 51 * 52 * <ul> 53 * <li>A locale ID is up to four subtags {@code 54 * <language>(_<script>)?(_<region>)?(_<variant>)?} 55 * <li>The allowed subtag separator is only ASCII underscore (not hyphen). 56 * <li>The language subtag must exist (though it is permitted to be {@code "und"}). 57 * <li>All other subtags are optional and are separated by a single underscore. 58 * <li>Language subtag is lower-case, and is either 2 or 3 letters (i.e. "[a-z]{2,3}"). 59 * <li>Script subtag is mixed-case and must match {@code "[A-Z][a-z]{3}"}. 60 * <li>Region subtag is upper-case and must match {@code "[A-Z]{2}} or {@code "[0-9]{3}"}. 61 * <li>Variant subtag is upper- or lower-case and must match {@code "[A-Z]{5,}} or {@code 62 * "[0-9][A-Z0-9]{3}"}. Note: The EBNF at 63 * https://www.unicode.org/reports/tr35/#unicode_variant_subtag allows either lettercase, 64 * and the data at common/validity/variant.xml user lower. CLDR 40 has be_tarask, 65 * ca_ES_VALENCIA, en_US_POSIX. 66 * <li>The special locale ID {@code "root"} is also permitted. 67 * <ul> 68 * <p>Note that this check does don't enforce validity in terms of checking for 69 * deprecated languages, regions or script, so things like {@code "sh_YU"} (deprecated 70 * language and/or region) are accepted. 71 * <p>Examples of valid locale IDs are {@code "en"}, {@code "zh_Hant"}, {@code "fr_CA"}, 72 * {@code "sr_Latn_RS"} and {@code "ja_JP_TRADITIONAL"}. 73 * <p>Examples of invalid locale IDs are {@code ""}, {@code "en_"}, {@code "Latn"} and 74 * {@code "de__TRADITIONAL"}. 75 * 76 * @param localeId the ID the check. 77 * @throws IllegalArgumentException is the ID is invalid. 78 */ checkCldrLocaleId(String localeId)79 public static void checkCldrLocaleId(String localeId) { 80 // This check runs on a lot of locales, so make it as minimal as possible. If normalization 81 // is ever needed, do it in a separate method. 82 checkArgument( 83 LOCALE_ID.matcher(localeId).matches() || localeId.equals("root"), 84 "bad locale ID: %s", 85 localeId); 86 } 87 } 88