• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.util;
10 
11 import java.util.ArrayList;
12 import java.util.Collection;
13 import java.util.Collections;
14 import java.util.Comparator;
15 import java.util.EnumSet;
16 import java.util.Iterator;
17 import java.util.List;
18 import java.util.Locale;
19 import java.util.Map;
20 import java.util.Map.Entry;
21 import java.util.NoSuchElementException;
22 import java.util.Set;
23 import java.util.StringTokenizer;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import java.util.regex.Pattern;
27 
28 import org.unicode.cldr.tool.LikelySubtags;
29 
30 import com.google.common.base.CharMatcher;
31 import com.google.common.base.Joiner;
32 import com.google.common.base.Splitter;
33 import com.google.common.collect.ImmutableList;
34 import com.google.common.collect.ImmutableMap;
35 import com.ibm.icu.dev.util.CollectionUtilities;
36 import com.ibm.icu.impl.Row.R2;
37 import com.ibm.icu.text.UnicodeSet;
38 
39 public class LanguageTagParser {
40 
41     private static final Joiner HYPHEN_JOINER = Joiner.on('-');
42 
43     private static final Comparator<? super String> EXTENSION_ORDER = new Comparator<String>() {
44 
45         @Override
46         public int compare(String o1, String o2) {
47             int diff = getBucket(o1) - getBucket(o2);
48             if (diff != 0) {
49                 return diff;
50             }
51             return o1.compareTo(o2);
52         }
53 
54         private int getBucket(String o1) {
55             switch (o1.length()) {
56             case 1:
57                 return o1.charAt(0) == 't' ? 0 : 2;
58             case 2:
59                 return o1.charAt(1) <= '9' ? 1 : 3;
60             default:
61                 throw new IllegalArgumentException();
62             }
63         }
64     };
65 
66     /**
67      * @return Returns the language, or "" if none.
68      */
getLanguage()69     public String getLanguage() {
70         return language;
71     }
72 
73     /**
74      * @return Returns the script, or "" if none.
75      */
getScript()76     public String getScript() {
77         return script;
78     }
79 
80     /**
81      * @return Returns the region, or "" if none.
82      */
getRegion()83     public String getRegion() {
84         return region;
85     }
86 
87     /**
88      * @return Returns the variants.
89      */
getVariants()90     public List<String> getVariants() {
91         return ImmutableList.copyOf(variants);
92     }
93 
94     /**
95      * @return Returns the grandfathered flag
96      */
isGrandfathered()97     public boolean isGrandfathered() {
98         return grandfathered;
99     }
100 
101     /**
102      * @return Returns the extensions.
103      */
104     @Deprecated
getExtensions()105     public Map<String, String> getExtensions() {
106         return OutputOption.ICU.convert(extensions);
107     }
108 
109     /**
110      * @return Returns the localeExtensions.
111      */
112     @Deprecated
getLocaleExtensions()113     public Map<String, String> getLocaleExtensions() {
114         return OutputOption.ICU.convert(localeExtensions);
115     }
116 
117     /**
118      * @return Returns the extensions.
119      */
getExtensionsDetailed()120     public Map<String, List<String>> getExtensionsDetailed() {
121         return ImmutableMap.copyOf(extensions);
122     }
123 
124     /**
125      * @return Returns the localeExtensions.
126      */
getLocaleExtensionsDetailed()127     public Map<String, List<String>> getLocaleExtensionsDetailed() {
128         return ImmutableMap.copyOf(localeExtensions);
129     }
130 
131     /**
132      * @return Returns the original, preparsed language tag
133      */
getOriginal()134     public String getOriginal() {
135         return original;
136     }
137 
138     /**
139      * @return Returns the language-script (or language) part of a tag.
140      */
getLanguageScript()141     public String getLanguageScript() {
142         if (script.length() != 0) return language + "_" + script;
143         return language;
144     }
145 
146     /**
147      * @param in
148      *            Collection of language tag strings
149      * @return Returns each of the language-script tags in the collection.
150      */
getLanguageScript(Collection<String> in)151     public static Set<String> getLanguageScript(Collection<String> in) {
152         return getLanguageAndScript(in, null);
153     }
154 
155     /**
156      * @param in
157      *            Collection of language tag strings
158      * @return Returns each of the language-script tags in the collection.
159      */
getLanguageAndScript(Collection<String> in, Set<String> output)160     public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
161         if (output == null) output = new TreeSet<String>();
162         LanguageTagParser lparser = new LanguageTagParser();
163         for (Iterator<String> it = in.iterator(); it.hasNext();) {
164             output.add(lparser.set(it.next()).getLanguageScript());
165         }
166         return output;
167     }
168 
169     // private fields
170 
171     private String original;
172     private boolean grandfathered = false;
173     private String language;
174     private String script;
175     private String region;
176     private Set<String> variants = new TreeSet<String>();
177     private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map
178     private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>(EXTENSION_ORDER);
179 
180     private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
181     private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
182     private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
183     private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
184     private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
185     private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
186     private static StandardCodes standardCodes = StandardCodes.make();
187     private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered");
188     private static final String separator = "-_"; // '-' alone for 3066bis language tags
189     private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
190     private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
191     private static final Splitter SPLIT_COLON = Splitter.on(';');
192     private static final Splitter SPLIT_EQUAL = Splitter.on('=');
193     private static SupplementalDataInfo SDI = null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance
194 
195     /**
196      * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
197      * If a private-use field is found, it is returned as the last extension.<br>
198      * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
199      * isValid.
200      *
201      * @param languageTag
202      * @return
203      */
set(String languageTag)204     public LanguageTagParser set(String languageTag) {
205         if (languageTag.length() == 0 || languageTag.equals("root")) {
206             // throw new IllegalArgumentException("Language tag cannot be empty");
207             //
208             // With ICU 64 the language tag for root is normalized to empty string so we
209             // cannot throw for empty string as above. However, code here and in clients
210             // assumes a non-empty language tag, so for now just map "" or "root" to "und".
211             languageTag = "und";
212         } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) {
213             languageTag = "und" + languageTag;
214         }
215         languageTag = languageTag.toLowerCase(Locale.ROOT);
216 
217         // clear everything out
218         language = region = script = "";
219         grandfathered = false;
220         variants.clear();
221         extensions.clear();
222         localeExtensions.clear();
223         original = languageTag;
224         int atPosition = languageTag.indexOf('@');
225         if (atPosition >= 0) {
226             final String extensionsString = languageTag.substring(atPosition + 1).toLowerCase(Locale.ROOT);
227             for (String keyValue : SPLIT_COLON.split(extensionsString)) {
228                 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
229                 final String key = keyValuePair.next();
230                 final String value = keyValuePair.next();
231                 if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
232                     throwError(keyValue, "Invalid key/value pair");
233                 }
234                 List<String> valueList = SPLIT_BAR.splitToList(value);
235                 switch(key.length()) {
236                 case 1:
237                     extensions.put(key, valueList);
238                     break;
239                 case 2:
240                     localeExtensions.put(key, valueList);
241                     break;
242                 default:
243                     throwError(keyValue, "Invalid key/value pair");
244                     break;
245                 }
246             }
247             languageTag = languageTag.substring(0, atPosition);
248         }
249 
250         // first test for grandfathered
251         if (grandfatheredCodes.contains(languageTag)) {
252             language = languageTag;
253             grandfathered = true;
254             return this;
255         }
256 
257         // each time we fetch a token, we check for length from 1..8, and all alphanum
258         StringTokenizer st = new StringTokenizer(languageTag, separator);
259         String subtag;
260         try {
261             subtag = getSubtag(st);
262         } catch (Exception e1) {
263             throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
264         }
265 
266         // check for private use (x-...) and return if so
267         if (subtag.equalsIgnoreCase("x")) {
268             getExtension(subtag, st, 1);
269             return this;
270         }
271 
272         // check that language subtag is valid
273         if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
274             throwError(subtag, "Invalid language subtag");
275         }
276         try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
277             language = subtag;
278             subtag = getSubtag(st); // prepare for next
279 
280             // check for script, 4 letters
281             if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
282                 script = subtag;
283                 script = script.substring(0, 1).toUpperCase(Locale.ROOT)
284                     + script.substring(1);
285                 subtag = getSubtag(st); // prepare for next
286             }
287 
288             // check for region, 2 letters or 3 digits
289             if (subtag.length() == 2 && ALPHA.containsAll(subtag)
290                 || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
291                 region = subtag.toUpperCase(Locale.ENGLISH);
292                 subtag = getSubtag(st); // prepare for next
293             }
294 
295             // get variants: length > 4 or len=4 & starts with digit
296             while (isValidVariant(subtag)) {
297                 variants.add(subtag);
298                 subtag = getSubtag(st); // prepare for next
299             }
300 
301             // get extensions: singleton '-' subtag (2-8 long)
302             while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
303                 subtag = getExtension(subtag, st, 2);
304                 if (subtag == null) return this; // done
305             }
306 
307             if (subtag.equalsIgnoreCase("x")) {
308                 getExtension(subtag, st, 1);
309                 return this;
310             }
311 
312             // if we make it to this point, then we have an error
313             throwError(subtag, "Illegal subtag");
314 
315         } catch (NoSuchElementException e) {
316             // this exception just means we ran out of tokens. That's ok, so we just return.
317         }
318         return this;
319     }
320 
isValidVariant(String subtag)321     private boolean isValidVariant(String subtag) {
322         return subtag != null && ALPHANUM.containsAll(subtag)
323             && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
324     }
325 
326     /**
327      *
328      * @return true iff the language tag validates
329      */
isValid()330     public boolean isValid() {
331         if (grandfathered) return true; // don't need further checking, since we already did so when parsing
332         if (!validates(language, "language")) return false;
333         if (!validates(script, "script")) return false;
334         if (!validates(region, "territory")) return false;
335         for (Iterator<String> it = variants.iterator(); it.hasNext();) {
336             if (!validates(it.next(), "variant")) return false;
337         }
338         return true; // passed the gauntlet
339     }
340 
341     public enum Status {
342         WELL_FORMED, VALID, CANONICAL, MINIMAL
343     }
344 
getStatus(Set<String> errors)345     public Status getStatus(Set<String> errors) {
346         errors.clear();
347         if (!isValid()) {
348             return Status.WELL_FORMED;
349             // TODO, check the bcp47 extension codes also
350         }
351 
352         if (SDI == null) {
353             SDI = SupplementalDataInfo.getInstance();
354         }
355         Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
356         Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
357 
358         if (aliasInfo.get("language").containsKey(language)) {
359             errors.add("Non-canonical language: " + language);
360         }
361         Map<String, String> lstrInfo = languageInfo.get(language);
362         if (lstrInfo != null) {
363             String scope = lstrInfo.get("Scope");
364             if ("collection".equals(scope)) {
365                 errors.add("Collection language: " + language);
366             }
367         }
368         if (aliasInfo.get("script").containsKey(script)) {
369             errors.add("Non-canonical script: " + script);
370         }
371         if (aliasInfo.get("territory").containsKey(region)) {
372             errors.add("Non-canonical region: " + region);
373         }
374         if (!errors.isEmpty()) {
375             return Status.VALID;
376         }
377         String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
378         String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
379         if (minimized == null) {
380             errors.add("No minimal data for:" + tag);
381             if (script.isEmpty() && region.isEmpty()) {
382                 return Status.MINIMAL;
383             } else {
384                 return Status.CANONICAL;
385             }
386         }
387         if (!tag.equals(minimized)) {
388             errors.add("Not minimal:" + tag + "-->" + minimized);
389             return Status.CANONICAL;
390         }
391         return Status.MINIMAL;
392     }
393 
394     /**
395      * @param subtag
396      * @param type
397      * @return true if the subtag is empty, or if it is in the registry
398      */
validates(String subtag, String type)399     private boolean validates(String subtag, String type) {
400         return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
401     }
402 
403     /**
404      * Internal method
405      *
406      * @param minLength
407      *            TODO
408      */
getExtension(String subtag, StringTokenizer st, int minLength)409     private String getExtension(String subtag, StringTokenizer st, int minLength) {
410         String base = subtag;
411         final char extension = subtag.charAt(0);
412         if (extensions.containsKey(subtag)) {
413             throwError(subtag, "Can't have two extensions with the same key");
414         }
415         if (!st.hasMoreElements()) {
416             throwError(subtag, "Private Use / Extension requires subsequent subtag");
417         }
418         boolean takesSubkeys = extension == 'u' || extension == 't';
419         boolean firstT = extension == 't';
420         boolean haveContents = false;
421         List<String> result = new ArrayList<>();
422         try {
423             while (st.hasMoreElements()) {
424                 subtag = getSubtag(st);
425                 if (subtag.length() < minLength) {
426                     return subtag;
427                 }
428                 if (takesSubkeys
429                     && subtag.length() == 2
430                     && (!firstT || isTKey(subtag))) { // start new key-value pair
431                     if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
432                         localeExtensions.put(base, ImmutableList.copyOf(result));
433                         haveContents = true;
434                         result.clear();
435                     }
436                     base = subtag;
437                     continue;
438                 }
439                 firstT = false;
440                 result.add(subtag);
441             }
442             return null;
443         } finally {
444             if (takesSubkeys) {
445                 if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
446                     localeExtensions.put(base, ImmutableList.copyOf(result));
447                     haveContents = true;
448                 }
449                 if (!haveContents) {
450                     throw new IllegalArgumentException("extension must not be empty: " + base);
451                 }
452             } else {
453                 if (result.isEmpty()) {
454                     throw new IllegalArgumentException("extension must not be empty: " + base);
455                 }
456                 extensions.put(base, ImmutableList.copyOf(result));
457             }
458         }
459     }
460 
461     /**
462      * Internal method
463      */
getSubtag(StringTokenizer st)464     private String getSubtag(StringTokenizer st) {
465         String result = st.nextToken();
466         if (result.length() < 1 || result.length() > 8) {
467             throwError(result, "Illegal length (must be 1..8)");
468         }
469         if (!ALPHANUM.containsAll(result)) {
470             throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
471         }
472         return result;
473     }
474 
475     /**
476      * Internal method
477      */
throwError(String subtag, String errorText)478     private void throwError(String subtag, String errorText) {
479         throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
480     }
481 
setRegion(String region)482     public LanguageTagParser setRegion(String region) {
483         this.region = region;
484         return this;
485     }
486 
setScript(String script)487     public LanguageTagParser setScript(String script) {
488         this.script = script;
489         return this;
490     }
491 
492     public enum OutputOption {
493         ICU('_'), BCP47('-');
494         final char separator;
495         final Joiner joiner;
496 
OutputOption(char separator)497         private OutputOption(char separator) {
498             this.separator = separator;
499             joiner = Joiner.on(separator);
500         }
501 
convert(Map<String, List<String>> mapToList)502         public Map<String, String> convert(Map<String, List<String>> mapToList) {
503             if (mapToList.isEmpty()) {
504                 return Collections.emptyMap();
505             }
506             ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
507             for (Entry<String, List<String>> entry : mapToList.entrySet()) {
508                 builder.put(entry.getKey(), joiner.join(entry.getValue()));
509             }
510             return builder.build();
511         }
512     }
513 
toString()514     public String toString() {
515         return toString(OutputOption.ICU);
516     }
517 
toString(OutputOption oo)518     public String toString(OutputOption oo) {
519         StringBuilder result = new StringBuilder(language); // optimize for the simple cases
520         if (this.script.length() != 0) result.append(oo.separator).append(script);
521         if (this.region.length() != 0) result.append(oo.separator).append(region);
522         if (this.variants.size() != 0) {
523             for (String variant : variants) {
524                 result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
525             }
526         }
527         boolean haveAt = false;
528         boolean needSep = false;
529 
530         StringBuilder extensionsAfterU = null;
531         StringBuilder extensionX = null;
532         if (this.extensions.size() != 0) {
533             StringBuilder target = result;
534             for (Entry<String, List<String>> extension : extensions.entrySet()) {
535                 String key = extension.getKey();
536                 String value = oo.joiner.join(extension.getValue());
537                 switch (key) {
538                 case "v":
539                 case "w":
540                 case "y":
541                 case "z":
542                     if (extensionsAfterU == null) {
543                         extensionsAfterU = new StringBuilder();
544                     }
545                     target = extensionsAfterU;
546                     break;
547                 case "x":
548                     if (extensionX == null) {
549                         extensionX = new StringBuilder();
550                     }
551                     target = extensionX;
552                     break;
553                 default:
554                     // no action; we already have target set right for earlier items.
555                 }
556                 if (oo == OutputOption.BCP47) {
557                     target.append(oo.separator).append(key)
558                     .append(oo.separator).append(value);
559                 } else {
560                     if (!haveAt) {
561                         target.append('@');
562                         haveAt = true;
563                     }
564                     if (needSep) {
565                         target.append(";");
566                     } else {
567                         needSep = true;
568                     }
569                     target.append(key)
570                     .append('=').append(value);
571                 }
572             }
573         }
574         if (this.localeExtensions.size() != 0) {
575             if (oo == OutputOption.BCP47) {
576                 List<String> tValue = localeExtensions.get("t");
577                 if (tValue != null) {
578                     result.append(oo.separator).append('t')
579                     .append(oo.separator).append(oo.joiner.join(tValue));
580                     for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
581                         String key = extension.getKey();
582                         if (isTKey(key)) {
583                             String value = oo.joiner.join(extension.getValue());
584                             result.append(oo.separator).append(key).append(oo.separator).append(value);
585                         }
586                     }
587                 }
588                 boolean haveU = false;
589                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
590                     if (!haveU) {
591                         List<String> uValue = localeExtensions.get("u");
592                         result.append(oo.separator).append('u');
593                         if (uValue != null) {
594                             result.append(oo.separator).append(oo.joiner.join(uValue));
595                         }
596                         haveU = true;
597                     }
598                     String key = extension.getKey();
599                     if (key.length() == 2 && key.charAt(1) >= 'a') {
600                         String value = oo.joiner.join(extension.getValue());
601                         result.append(oo.separator).append(key).append(oo.separator).append(value);
602                     }
603                 }
604             } else {
605                 if (!haveAt) {
606                     result.append('@');
607                 }
608                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
609                     if (needSep) {
610                         result.append(";");
611                     } else {
612                         needSep = true;
613                     }
614                     String key = extension.getKey();
615                     String value = oo.joiner.join(extension.getValue());
616                     result.append(key.toUpperCase(Locale.ROOT))
617                     .append('=').append(value.toUpperCase(Locale.ROOT));
618                 }
619             }
620         }
621         // do extensions after u, with x last
622         if (extensionsAfterU != null) {
623             result.append(extensionsAfterU);
624         }
625         if (extensionX != null) {
626             result.append(extensionX);
627         }
628         return result.toString();
629     }
630 
isTKey(String key)631     public static boolean isTKey(String key) {
632         return key.length() == 2 && key.charAt(1) < 'a';
633     }
634 
hasT()635     public boolean hasT() {
636         for (String key : localeExtensions.keySet()) {
637             if (key.equals("t") || isTKey(key)) {
638                 return true;
639             }
640         }
641         return false;
642     }
643 
644     /**
645      * Return just the language, script, and region (no variants or extensions)
646      * @return
647      */
toLSR()648     public String toLSR() {
649         String result = language; // optimize for the simple cases
650         if (this.script.length() != 0) result += "_" + script;
651         if (this.region.length() != 0) result += "_" + region;
652         return result;
653     }
654 
655     public enum Fields {
656         LANGUAGE, SCRIPT, REGION, VARIANTS
657     };
658 
659     public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
660     public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
661     public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
662         Fields.SCRIPT, Fields.REGION));
663 
toString(Set<Fields> selection)664     public String toString(Set<Fields> selection) {
665         String result = language;
666         if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
667         if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
668         if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
669             for (String variant : (Collection<String>) variants) {
670                 result += "_" + variant;
671             }
672         }
673         return result;
674     }
675 
setLanguage(String language)676     public LanguageTagParser setLanguage(String language) {
677         if (SEPARATORS.containsSome(language)) {
678             String oldScript = script;
679             String oldRegion = region;
680             Set<String> oldVariants = variants;
681             set(language);
682             if (script.length() == 0) {
683                 script = oldScript;
684             }
685             if (region.length() == 0) {
686                 region = oldRegion;
687             }
688             if (oldVariants.size() != 0) {
689                 variants = oldVariants;
690             }
691         } else {
692             this.language = language;
693         }
694         return this;
695     }
696 
setLocaleExtensions(Map<String, String> localeExtensions)697     public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
698         this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
699         return this;
700     }
701 
setVariants(Collection<String> newVariants)702     public LanguageTagParser setVariants(Collection<String> newVariants) {
703         for (String variant : newVariants) {
704             if (!isValidVariant(variant)) {
705                 throw new IllegalArgumentException("Illegal variant: " + variant);
706             }
707         }
708         variants.clear();
709         variants.addAll(newVariants);
710         return this;
711     }
712 
713     static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
714 
setExtensions(Map<String, String> newExtensions)715     public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
716         this.extensions = expandMap(newExtensions, 2, 8);
717         return this;
718     }
719 
getSimpleParent(String s)720     public static String getSimpleParent(String s) {
721         int lastBar = s.lastIndexOf('_');
722         return lastBar >= 0 ? s.substring(0, lastBar) : "";
723     }
724 
expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength)725     private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
726         if (newLocaleExtensions.isEmpty()) {
727             return Collections.emptyMap();
728         }
729         ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
730         for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
731             result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
732         }
733         return result.build();
734     }
735 
split(String value, int minLength, int maxLength)736     private List<String> split(String value, int minLength, int maxLength) {
737         List<String> values = SPLIT_BAR.splitToList(value);
738         for (String s : values) {
739             if (s.length() < minLength || s.length() > maxLength) {
740                 throw new IllegalArgumentException("Illegal subtag length for: " + s);
741             }
742             if (!ALPHANUM.containsAll(s)) {
743                 throw new IllegalArgumentException("Illegal locale character in: " + s);
744             }
745         }
746         return values;
747     }
748 
749     public enum Format {icu("_","_"), bcp47("-","-"), structure("; ", "=");
750         public final String separator;
751         public final String separator2;
Format(String separator, String separator2)752         private Format(String separator, String separator2) {
753             this.separator = separator;
754             this.separator2 = separator2;
755         }
756     };
757 
toString(Format format)758     public String toString(Format format) {
759         StringBuilder result = new StringBuilder();
760         if (format == Format.structure) {
761             result.append("[");
762         }
763         appendField(format, result, "language", language);
764         appendField(format, result, "script", script);
765         appendField(format, result, "region", region);
766         appendField(format, result, "variants", variants);
767         appendField(format, result, "extensions", extensions, new UnicodeSet('a','s'));
768         appendField(format, result, "localeX", localeExtensions, null);
769         appendField(format, result, "extensions", extensions,  new UnicodeSet('v','w', 'y','z'));
770         appendField(format, result, "extensions", extensions, new UnicodeSet('x','x'));
771         if (format == Format.structure) {
772             result.append("]");
773         }
774 //            if (script.length() != 0) {
775 //                result. += "_" + script;
776 //            }
777 //            if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
778 //            if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
779 //                for (String variant : (Collection<String>) variants) {
780 //                    result += "_" + variant;
781 //                }
782 //            }
783         return result.toString();
784     }
785 
appendField(Format format, StringBuilder result, String fieldName, String fieldValue)786     private void appendField(Format format, StringBuilder result, String fieldName, String fieldValue) {
787         if (!fieldValue.isEmpty()) {
788             if (result.length() > 1) {
789                 result.append(format.separator);
790             }
791             if (format == Format.structure) {
792                 result.append(fieldName).append("=");
793             }
794             result.append(fieldValue);
795         }
796     }
797 
appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue)798     private void appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue) {
799         result.append(format.separator).append(fieldName).append(format.separator2).append(fieldValue);
800     }
801 
appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues)802     private void appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) {
803         if (!fieldValues.isEmpty()) {
804             appendField(format, result, fieldName, CollectionUtilities.join(fieldValues, ","));
805         }
806     }
807 
808     /**
809      * null match means it is -t- or -u-
810      */
appendField(Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match)811     private void appendField(Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match) {
812         if (match == null && format != Format.structure) {
813             List<String> tLang = fieldValues.get("t");
814             List<String> uSpecial = fieldValues.get("u");
815             boolean haveTLang = tLang != null;
816             boolean haveUSpecial = uSpecial != null;
817 
818             // do all the keys ending with digits first
819             boolean haveT = false;
820             boolean haveU = false;
821             StringBuilder result2 = new StringBuilder(); // put -u- at end
822             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
823                 String key = entry.getKey();
824                 if (key.length() < 2) {
825                     continue;
826                 }
827                 int lastChar = key.codePointBefore(key.length());
828                 if (lastChar < 'a') {
829                     if (!haveT) {
830                         result.append(format.separator).append('t');
831                         if (haveTLang) { // empty is illegal, but just in case
832                             result.append(format.separator).append(CollectionUtilities.join(tLang, format.separator));
833                             haveTLang = false;
834                         }
835                         haveT = true;
836                     }
837                     appendFieldKey(format, result, entry.getKey(), CollectionUtilities.join(entry.getValue(), format.separator));
838                 } else {
839                     if (!haveU) {
840                         result2.append(format.separator).append('u');
841                         if (haveUSpecial) { // not yet valid, but just in case
842                             result2.append(format.separator).append(CollectionUtilities.join(uSpecial, format.separator));
843                             haveUSpecial = false;
844                         }
845                         haveU = true;
846                     }
847                     appendFieldKey(format, result2, entry.getKey(), CollectionUtilities.join(entry.getValue(), format.separator));
848                 }
849             }
850             if (haveTLang) {
851                 result.append(format.separator).append('t').append(format.separator).append(CollectionUtilities.join(tLang, format.separator));
852             }
853             if (haveUSpecial) {
854                 result2.append(format.separator).append('u').append(format.separator).append(CollectionUtilities.join(uSpecial, format.separator));
855             }
856             result.append(result2); // put in right order
857         } else {
858             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
859                 if (match == null || match.contains(entry.getKey())) {
860                     appendFieldKey(format, result, entry.getKey(), CollectionUtilities.join(entry.getValue(), format.separator));
861                 }
862             }
863         }
864     }
865 }