• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.util.Arrays;
4 import java.util.Collections;
5 import java.util.HashSet;
6 import java.util.LinkedHashMap;
7 import java.util.List;
8 import java.util.Locale;
9 import java.util.Map;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12 
13 import org.unicode.cldr.util.PatternCache;
14 
/**
 * Parse Locales, extended to BCP 47 and CLDR. Also normalizes the case of the results.
 * Only does syntactic parse: does not replace deprecated elements; does not check for validity.
 * Will throw IllegalArgumentException for duplicate variants and extensions.
 * <p>
 * A parser instance holds the result of the most recent successful {@link #set(String)} in plain,
 * unsynchronized fields. A call to {@code set} that returns {@code false} or throws leaves the previously
 * parsed result untouched. Before the first successful call the parser reports language "und" with
 * every other field empty.
 *
 * @author markdavis
 */
class SimpleLocaleParser {
    // mechanically generated regex -- don't worry about trying to read it!
    // if we want to allow multiple --, change [-_] into [-_]+
    private static final Pattern rootPattern = Pattern.compile(
        "(?:"
            +
            " (?: ( [a-z]{2,8} )"
            + // language
            "   (?: [-_] ( [a-z]{4} ) )?"
            + // script
            "   (?: [-_] ( [a-z]{2} | [0-9]{3} ) )?"
            + // region
            "   (?: [-_] ( (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) (?: [-_] (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) )* ) )?"
            + // variant(s)
            "   (?: [-_] ( [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ (?: [-_] [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ )* ) )?"
            + // extensions
            "   (?: [-_] ( x (?: [-_] [a-z 0-9]{1,8} )+ ) )? )"
            + // private use
            " | ( x (?: [-_] [a-z 0-9]{1,8} )+ )"
            + // private use
            " | ( en [-_] GB [-_] oed"
            + // grandfathered gorp
            "   | i [-_] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )"
            +
            "   | no [-_] (?: bok | nyn )" +
            "   | sgn [-_] (?: BE [-_] (?: fr | nl) | CH [-_] de )" +
            "   | zh [-_] (?: cmn (?: [-_] Hans | [-_] Hant )? | gan | min (?: [-_] nan)? | wuu | yue ) ) )" +
            " (?: \\@ ((?: [a-z 0-9]+ \\= [a-z 0-9]+) (?: \\; (?: [a-z 0-9]+ \\= [a-z 0-9]+))*))?" + // CLDR/ICU
            // keywords
            "",
        Pattern.COMMENTS | Pattern.CASE_INSENSITIVE); // TODO change above to be lowercase, since source is
    // already when we compare

    // Capture groups of rootPattern, in order of their opening parenthesis.
    private static final int GROUP_LANGUAGE = 1;
    private static final int GROUP_SCRIPT = 2;
    private static final int GROUP_REGION = 3;
    private static final int GROUP_VARIANTS = 4;
    private static final int GROUP_EXTENSIONS = 5;
    private static final int GROUP_PRIVATE_USE = 6; // private use following a language
    private static final int GROUP_PRIVATE_USE_ONLY = 7; // tag that is entirely private use
    private static final int GROUP_GRANDFATHERED = 8;
    private static final int GROUP_KEYWORDS = 9;

    // Other regex patterns for splitting apart lists of items detected above.
    // Compiled directly, like the other pattern constants of this class.
    private static final Pattern variantSeparatorPattern = Pattern.compile("[-_]");
    private static final Pattern extensionPattern = Pattern.compile(
        "([a-z]) [-_] ( [a-z 0-9]{2,8} (?:[-_] [a-z 0-9]{2,8})* )", Pattern.COMMENTS);
    private static final Pattern privateUsePattern = Pattern.compile(
        "(x) [-_] ( [a-z 0-9]{1,8} (?:[-_] [a-z 0-9]{1,8})* )", Pattern.COMMENTS);
    private static final Pattern keywordPattern = Pattern.compile("([a-z 0-9]+) \\= ([a-z 0-9]+)", Pattern.COMMENTS);

    /**
     * The fields set by set(). They start out as the empty result so that the getters are usable
     * (and never return null) before the first successful parse.
     */
    private String language = "und";
    private String script = "";
    private String region = "";
    private List<String> variants = Collections.emptyList();
    private Map<String, String> extensions = Collections.emptyMap();

    /**
     * Set the object to the source.
     * <p>
     * Example (artificially complicated):
     *
     * <pre>
     * myParser.set(&quot;zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic&quot;);
     * String language = myParser.getLanguage();
     * </pre>
     *
     * The whole source is parsed before any field is touched, so a call that returns false or throws
     * leaves the result of the previous successful call in place.
     *
     * @param source
     *            the locale identifier to parse; matched case-insensitively. Must not be null
     *            (a null source results in a NullPointerException).
     * @return true if the source was syntactically well-formed and the fields were updated;
     *         false if it did not match, in which case the fields are unchanged.
     * @throws IllegalArgumentException
     *             if the source contains a duplicate variant or a duplicate extension/keyword key
     */
    public boolean set(String source) {
        final Matcher root = rootPattern.matcher(source.toLowerCase(Locale.ENGLISH));
        if (!root.matches()) {
            return false;
        }

        // Parse everything into locals first; the helpers below may throw.
        String newLanguage = root.group(GROUP_LANGUAGE);
        if (newLanguage == null) {
            newLanguage = root.group(GROUP_GRANDFATHERED);
            if (newLanguage == null) {
                newLanguage = "und"; // placeholder for completely private use
            }
        }

        String newScript = root.group(GROUP_SCRIPT);
        if (newScript == null) {
            newScript = "";
        } else {
            // titlecase: the source was lowercased above, so only the first letter changes
            newScript = newScript.substring(0, 1).toUpperCase(Locale.ENGLISH) + newScript.substring(1);
        }

        String newRegion = root.group(GROUP_REGION);
        if (newRegion == null) {
            newRegion = "";
        } else {
            newRegion = newRegion.toUpperCase(Locale.ENGLISH);
        }

        final List<String> newVariants = parseVariants(root.group(GROUP_VARIANTS));

        // extensions are a bit more complicated; LinkedHashMap keeps them in source order
        final Map<String, String> newExtensions = new LinkedHashMap<String, String>();
        addExtensions(newExtensions, root.group(GROUP_EXTENSIONS), extensionPattern);
        addExtensions(newExtensions, root.group(GROUP_PRIVATE_USE), privateUsePattern);
        addExtensions(newExtensions, root.group(GROUP_PRIVATE_USE_ONLY), privateUsePattern);
        addExtensions(newExtensions, root.group(GROUP_KEYWORDS), keywordPattern);

        // Nothing can fail from here on: commit the new state in one go.
        language = newLanguage;
        script = newScript;
        region = newRegion;
        variants = newVariants;
        extensions = Collections.unmodifiableMap(newExtensions);
        return true;
    }

    /**
     * Split the matched variant list into an unmodifiable list of uppercased variants.
     *
     * @param variantList
     *            the variants as matched by the root pattern (separated by - or _), or null if none
     * @return unmodifiable list of variants, empty if variantList is null
     * @throws IllegalArgumentException
     *             if the same variant occurs more than once
     */
    private static List<String> parseVariants(String variantList) {
        if (variantList == null) {
            return Collections.emptyList();
        }
        // make uppercase for compatibility with CLDR.
        final List<String> result = Arrays.asList(
            variantSeparatorPattern.split(variantList.toUpperCase(Locale.ENGLISH)));
        // check for duplicate variants
        if (new HashSet<String>(result).size() != result.size()) {
            throw new IllegalArgumentException("Duplicate variants");
        }
        // Arrays.asList still allows set(int, E); wrap it so the list is really read-only
        return Collections.unmodifiableList(result);
    }

    /**
     * Add every key/value pair that the pattern finds in item to the target map.
     *
     * @param target
     *            the map being built; also used to detect keys seen in earlier calls
     * @param item
     *            the text to scan, or null (in which case nothing is added)
     * @param pattern
     *            pattern whose group 1 is the key and group 2 the value
     * @throws IllegalArgumentException
     *             if a key is already present in target
     */
    private static void addExtensions(Map<String, String> target, String item, Pattern pattern) {
        if (item == null) {
            return;
        }
        final Matcher extension = pattern.matcher(item);
        while (extension.find()) {
            final String key = extension.group(1);
            // check for duplicate keys
            if (target.containsKey(key)) {
                throw new IllegalArgumentException("duplicate key: " + key);
            }
            target.put(key, extension.group(2));
        }
    }

    /**
     * Return BCP 47 language subtag (may be ISO registered code).
     * If the language tag is irregular, then the entire tag is in the language field.
     * If the entire code is private use, then the language code is "und".
     * Examples:
     * <table style="border-width:1; border-style:collapse">
     * <tr>
     * <th>Input String</th>
     * <th>Parsed</th>
     * </tr>
     * <tr>
     * <td>zh-cmn-Hans</td>
     * <td>{language=zh-cmn-hans, script=, country=, variants=[], keywords={}}</td>
     * </tr>
     * <tr>
     * <td>i-default@abc=def</td>
     * <td>{language=i-default, script=, country=, variants=[], keywords={abc=def}}</td>
     * </tr>
     * <tr>
     * <td>x-foobar@abc=def</td>
     * <td>{language=und, script=, country=, variants=[], keywords={x=foobar, abc=def}}</td>
     * </tr>
     * </table>
     *
     * @return language subtag, lowercased.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Return BCP 47 script subtag (may be ISO or UN)
     *
     * @return script subtag, titlecased; empty string if there is none.
     */
    public String getScript() {
        return script;
    }

    /**
     * Return BCP 47 region subtag (may be ISO or UN)
     *
     * @return country (region) subtag, uppercased; empty string if there is none.
     */
    public String getCountry() {
        return region;
    }

    /**
     * Return immutable list of BCP 47 variants
     *
     * @return list of uppercased variants.
     */
    public List<String> getVariants() {
        return variants;
    }

    /**
     * Return the first variant, for compatibility
     *
     * @return first (uppercased) variant, or the empty string if there are no variants
     */
    public String getVariant() {
        return variants.isEmpty() ? "" : variants.get(0);
    }

    /**
     * Return immutable map of key/value extensions. Includes BCP 47 extensions and private use, also locale keyword
     * extensions. If the entire code is private use,
     * then the language is set to "und" for consistency.
     * <p>
     * Example:
     * <table style="border-width:1; border-style:collapse">
     * <tr>
     * <th>Input String</th>
     * <th>Parsed</th>
     * </tr>
     * <tr>
     * <td>zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic</td>
     * <td>{language=zh, script=Hans, country=HK, variants=[SCOUSE], keywords={a=foobar, x=a-en, collation=phonebook,
     * calendar=islamic}}</td>
     * </tr>
     * </table>
     *
     * @return map of key/value pairs, lowercased.
     */
    public Map<String, String> getExtensions() {
        return extensions;
    }

    /**
     * Return a debugging representation of the parsed fields, in the format shown in the
     * examples of {@link #getLanguage()} and {@link #getExtensions()}.
     */
    @Override
    public String toString() {
        return "{language=" + language
            + ", script=" + script
            + ", country=" + region
            + ", variants=" + variants
            + ", keywords=" + extensions
            + "}";
    }
}