1 package org.unicode.cldr.api; 2 3 import static com.google.common.base.Preconditions.checkArgument; 4 5 import java.util.regex.Pattern; 6 7 /** 8 * Utility methods for working with locale IDs as strings. This could, with a little thought, be 9 * made public if necessary. 10 */ 11 final class LocaleIds { 12 // From: https://unicode.org/reports/tr35/#Identifiers 13 // Locale ID is: 14 // (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)* 15 // 16 // However in CLDR data, there's always a language (even if it's "und"), and never more 17 // than one variant, so this can be simplified to: 18 // <language>(_<script>)?(_<region>)?(_<variant>)? 19 // 20 // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw"). 21 // Note that the specification allows for languages 5-8 characters long, but in reality 22 // this has never occurred yet, so it's ignored in this code. 23 // 24 // * Script is 4-letter Xxxx script identifier (e.g. "Latn"). 25 // The specification permits any casing for script subtags, but since all the data uses 26 // the capitalized "Xxxx" form, that's what this code expects. 27 // 28 // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric 29 // identifier (e.g. "001"). 30 // 31 // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting 32 // with a digit (this avoids any ambiguity with script subtags). However because ICU 33 // violates this rule by using "TRADITIONAL" (11-letters) the length restriction is 34 // merely "longer than 5". 35 // 36 // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows 37 // for either '-' or '_'). 38 // 39 // The regex for unambiguously matching a valid locale ID (other than "root") for CLDR data is: 40 private static final Pattern LOCALE_ID = 41 Pattern.compile("(?:[a-z]{2,3})" 42 + "(?:_(?:[A-Z][a-z]{3}))?" 43 + "(?:_(?:[A-Z]{2}|[0-9]{3}))?" 44 + "(?:_(?:[A-Z]{5,}|[0-9][A-Z0-9]{3}))?"); 45 46 /** 47 * Checks whether the given ID is valid for CLDR use (including case). Locale IDs for use in 48 * CLDR APIs are a subset of all possible locale IDs and, unlike general locale IDs, they 49 * are case sensitive. The rules are: 50 * <ul> 51 * <li>A locale ID is up to four subtags {@code 52 * <language>(_<script>)?(_<region>)?(_<variant>)?} 53 * <li>The allowed subtag separator is only ASCII underscore (not hyphen). 54 * <li>The language subtag must exist (though it is permitted to be {@code "und"}). 55 * <li>All other subtags are optional and are separated by a single underscore. 56 * <li>Language subtag is lower-case, and is either 2 or 3 letters (i.e. "[a-z]{2,3}"). 57 * <li>Script subtag is mixed-case and must match {@code "[A-Z][a-z]{3}"}. 58 * <li>Region subtag is upper-case and must match {@code "[A-Z]{2}} or {@code "[0-9]{3}"}. 59 * <li>Variant subtag is upper-case and must match {@code "[A-Z]{5,}} or 60 * {@code "[0-9][A-Z0-9]{3}"}. 61 * <li>The special locale ID {@code "root"} is also permitted. 62 * <ul> 63 * 64 * <p>Note that this check does don't enforce validity in terms of checking for deprecated 65 * languages, regions or script, so things like {@code "sh_YU"} (deprecated language and/or 66 * region) are accepted. 67 * 68 * <p>Examples of valid locale IDs are {@code "en"}, {@code "zh_Hant"}, {@code "fr_CA"}, 69 * {@code "sr_Latn_RS"} and {@code "ja_JP_TRADITIONAL"}. 70 * 71 * <p>Examples of invalid locale IDs are {@code ""}, {@code "en_"}, {@code "Latn"} and 72 * {@code "de__TRADITIONAL"}. 73 * 74 * @param localeId the ID the check. 75 * @throws IllegalArgumentException is the ID is invalid. 76 */ checkCldrLocaleId(String localeId)77 public static void checkCldrLocaleId(String localeId) { 78 // This check runs on a lot of locales, so make it as minimal as possible. If normalization 79 // is ever needed, do it in a separate method. 80 checkArgument(LOCALE_ID.matcher(localeId).matches() || localeId.equals("root"), 81 "bad locale ID: %s", localeId); 82 } 83 84 } 85