// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2003-2011, International Business Machines Corporation and   *
* others. All Rights Reserved.                                               *
******************************************************************************
*/
9 
10 package com.ibm.icu.impl;
11 
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.TreeMap;
17 
18 import com.ibm.icu.impl.locale.AsciiUtil;
19 
20 /**
21  * Utility class to parse and normalize locale ids (including POSIX style)
22  */
23 public final class LocaleIDParser {
24 
25     /**
26      * Char array representing the locale ID.
27      */
28     private char[] id;
29 
30     /**
31      * Current position in {@link #id} (while parsing).
32      */
33     private int index;
34 
35     /**
36      * Temporary buffer for parsed sections of data.
37      */
38     private StringBuilder buffer;
39 
40     // um, don't handle POSIX ids unless we request it.  why not?  well... because.
41     private boolean canonicalize;
42     private boolean hadCountry;
43 
44     // used when canonicalizing
45     Map<String, String> keywords;
46     String baseName;
47 
48     /**
49      * Parsing constants.
50      */
51     private static final char KEYWORD_SEPARATOR     = '@';
52     private static final char HYPHEN                = '-';
53     private static final char KEYWORD_ASSIGN        = '=';
54     private static final char COMMA                 = ',';
55     private static final char ITEM_SEPARATOR        = ';';
56     private static final char DOT                   = '.';
57     private static final char UNDERSCORE            = '_';
58 
LocaleIDParser(String localeID)59     public LocaleIDParser(String localeID) {
60         this(localeID, false);
61     }
62 
LocaleIDParser(String localeID, boolean canonicalize)63     public LocaleIDParser(String localeID, boolean canonicalize) {
64         id = localeID.toCharArray();
65         index = 0;
66         buffer = new StringBuilder(id.length + 5);
67         this.canonicalize = canonicalize;
68     }
69 
reset()70     private void reset() {
71         index = 0;
72         buffer = new StringBuilder(id.length + 5);
73     }
74 
75     // utilities for working on text in the buffer
76 
77     /**
78      * Append c to the buffer.
79      */
append(char c)80     private void append(char c) {
81         buffer.append(c);
82     }
83 
addSeparator()84     private void addSeparator() {
85         append(UNDERSCORE);
86     }
87 
88     /**
89      * Returns the text in the buffer from start to blen as a String.
90      */
getString(int start)91     private String getString(int start) {
92         return buffer.substring(start);
93     }
94 
95     /**
96      * Set the length of the buffer to pos, then append the string.
97      */
set(int pos, String s)98     private void set(int pos, String s) {
99         buffer.delete(pos, buffer.length());
100         buffer.insert(pos, s);
101     }
102 
103     /**
104      * Append the string to the buffer.
105      */
append(String s)106     private void append(String s) {
107         buffer.append(s);
108     }
109 
110     // utilities for parsing text out of the id
111 
112     /**
113      * Character to indicate no more text is available in the id.
114      */
115     private static final char DONE = '\uffff';
116 
117     /**
118      * Returns the character at index in the id, and advance index.  The returned character
119      * is DONE if index was at the limit of the buffer.  The index is advanced regardless
120      * so that decrementing the index will always 'unget' the last character returned.
121      */
next()122     private char next() {
123         if (index == id.length) {
124             index++;
125             return DONE;
126         }
127 
128         return id[index++];
129     }
130 
131     /**
132      * Advance index until the next terminator or id separator, and leave it there.
133      */
skipUntilTerminatorOrIDSeparator()134     private void skipUntilTerminatorOrIDSeparator() {
135         while (!isTerminatorOrIDSeparator(next()));
136         --index;
137     }
138 
139     /**
140      * Returns true if the character at index in the id is a terminator.
141      */
atTerminator()142     private boolean atTerminator() {
143         return index >= id.length || isTerminator(id[index]);
144     }
145 
146     /**
147      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
148      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
149      */
isTerminator(char c)150     private boolean isTerminator(char c) {
151         // always terminate at DOT, even if not handling POSIX.  It's an error...
152         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
153     }
154 
155     /**
156      * Returns true if the character is a terminator or id separator.
157      */
isTerminatorOrIDSeparator(char c)158     private boolean isTerminatorOrIDSeparator(char c) {
159         return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
160     }
161 
162     /**
163      * Returns true if the start of the buffer has an experimental or private language
164      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
165      */
haveExperimentalLanguagePrefix()166     private boolean haveExperimentalLanguagePrefix() {
167         if (id.length > 2) {
168             char c = id[1];
169             if (c == HYPHEN || c == UNDERSCORE) {
170                 c = id[0];
171                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
172             }
173         }
174         return false;
175     }
176 
177     /**
178      * Returns true if a value separator occurs at or after index.
179      */
haveKeywordAssign()180     private boolean haveKeywordAssign() {
181         // assume it is safe to start from index
182         for (int i = index; i < id.length; ++i) {
183             if (id[i] == KEYWORD_ASSIGN) {
184                 return true;
185             }
186         }
187         return false;
188     }
189 
190     /**
191      * Advance index past language, and accumulate normalized language code in buffer.
192      * Index must be at 0 when this is called.  Index is left at a terminator or id
193      * separator.  Returns the start of the language code in the buffer.
194      */
parseLanguage()195     private int parseLanguage() {
196         int startLength = buffer.length();
197 
198         if (haveExperimentalLanguagePrefix()) {
199             append(AsciiUtil.toLower(id[0]));
200             append(HYPHEN);
201             index = 2;
202         }
203 
204         char c;
205         while(!isTerminatorOrIDSeparator(c = next())) {
206             append(AsciiUtil.toLower(c));
207         }
208         --index; // unget
209 
210         if (buffer.length() - startLength == 3) {
211             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
212             if (lang != null) {
213                 set(0, lang);
214             }
215         }
216 
217         return 0;
218     }
219 
220     /**
221      * Advance index past language.  Index must be at 0 when this is called.  Index
222      * is left at a terminator or id separator.
223      */
skipLanguage()224     private void skipLanguage() {
225         if (haveExperimentalLanguagePrefix()) {
226             index = 2;
227         }
228         skipUntilTerminatorOrIDSeparator();
229     }
230 
231     /**
232      * Advance index past script, and accumulate normalized script in buffer.
233      * Index must be immediately after the language.
234      * If the item at this position is not a script (is not four characters
235      * long) leave index and buffer unchanged.  Otherwise index is left at
236      * a terminator or id separator.  Returns the start of the script code
237      * in the buffer (this may be equal to the buffer length, if there is no
238      * script).
239      */
parseScript()240     private int parseScript() {
241         if (!atTerminator()) {
242             int oldIndex = index; // save original index
243             ++index;
244 
245             int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
246             char c;
247             boolean firstPass = true;
248             while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
249                 if (firstPass) {
250                     addSeparator();
251                     append(AsciiUtil.toUpper(c));
252                     firstPass = false;
253                 } else {
254                     append(AsciiUtil.toLower(c));
255                 }
256             }
257             --index; // unget
258 
259             /* If it's not exactly 4 characters long, then it's not a script. */
260             if (index - oldIndex != 5) { // +1 to account for separator
261                 index = oldIndex;
262                 buffer.delete(oldBlen, buffer.length());
263             } else {
264                 oldBlen++; // index past hyphen, for clients who want to extract just the script
265             }
266 
267             return oldBlen;
268         }
269         return buffer.length();
270     }
271 
272     /**
273      * Advance index past script.
274      * Index must be immediately after the language and IDSeparator.
275      * If the item at this position is not a script (is not four characters
276      * long) leave index.  Otherwise index is left at a terminator or
277      * id separator.
278      */
skipScript()279     private void skipScript() {
280         if (!atTerminator()) {
281             int oldIndex = index;
282             ++index;
283 
284             char c;
285             while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
286             --index;
287 
288             if (index - oldIndex != 5) { // +1 to account for separator
289                 index = oldIndex;
290             }
291         }
292     }
293 
294     /**
295      * Advance index past country, and accumulate normalized country in buffer.
296      * Index must be immediately after the script (if there is one, else language)
297      * and IDSeparator.  Return the start of the country code in the buffer.
298      */
parseCountry()299     private int parseCountry() {
300         if (!atTerminator()) {
301             int oldIndex = index;
302             ++index;
303 
304             int oldBlen = buffer.length();
305             char c;
306             boolean firstPass = true;
307             while (!isTerminatorOrIDSeparator(c = next())) {
308                 if (firstPass) { // first, add hyphen
309                     hadCountry = true; // we have a country, let variant parsing know
310                     addSeparator();
311                     ++oldBlen; // increment past hyphen
312                     firstPass = false;
313                 }
314                 append(AsciiUtil.toUpper(c));
315             }
316             --index; // unget
317 
318             int charsAppended = buffer.length() - oldBlen;
319 
320             if (charsAppended == 0) {
321                 // Do nothing.
322             }
323             else if (charsAppended < 2 || charsAppended > 3) {
324                 // It's not a country, so return index and blen to
325                 // their previous values.
326                 index = oldIndex;
327                 --oldBlen;
328                 buffer.delete(oldBlen, buffer.length());
329                 hadCountry = false;
330             }
331             else if (charsAppended == 3) {
332                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
333                 if (region != null) {
334                     set(oldBlen, region);
335                 }
336             }
337 
338             return oldBlen;
339         }
340 
341         return buffer.length();
342     }
343 
344     /**
345      * Advance index past country.
346      * Index must be immediately after the script (if there is one, else language)
347      * and IDSeparator.
348      */
skipCountry()349     private void skipCountry() {
350         if (!atTerminator()) {
351             if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
352                 ++index;
353             }
354             /*
355              * Save the index point after the separator, since the format
356              * requires two separators if the country is not present.
357              */
358             int oldIndex = index;
359 
360             skipUntilTerminatorOrIDSeparator();
361             int charsSkipped = index - oldIndex;
362             if (charsSkipped < 2 || charsSkipped > 3) {
363                 index = oldIndex;
364             }
365         }
366     }
367 
368     // There are no strict limitation of the syntax of variant in the legacy
369     // locale format. If the locale is constructed from unicode_locale_id
370     // as defined in UTS35, then we know each unicode_variant_subtag
371     // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
372     // 179 would allow 20 unicode_variant_subtag with sep in the
373     // unicode_locale_id
374     // 8*20 + 1*(20-1) = 179
375     private static final int MAX_VARIANTS_LENGTH = 179;
376 
377     /**
378      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
379      * the codepage information from POSIX ids.  Index must be immediately after the country
380      * or script.  Index is left at the keyword separator or at the end of the text.  Return
381      * the start of the variant code in the buffer.
382      *
383      * In standard form, we can have the following forms:
384      * ll__VVVV
385      * ll_CC_VVVV
386      * ll_Ssss_VVVV
387      * ll_Ssss_CC_VVVV
388      *
389      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
390      * ll_CC.pppp          --> ll_CC
391      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
392      * ll_CC@VVVV          --> ll_CC_VVVV
393      *
394      * We identify this use of '@' in POSIX ids by looking for an '=' following
395      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
396      * being part of a POSIX id.
397      *
398      * Note:  since it was decided that we want an option to not handle POSIX ids, this
399      * becomes a bit more complex.
400      */
parseVariant()401     private int parseVariant() {
402         int oldBlen = buffer.length();
403 
404         boolean start = true;
405         boolean needSeparator = true;
406         boolean skipping = false;
407         char c;
408         boolean firstPass = true;
409 
410         while ((c = next()) != DONE) {
411             if (c == DOT) {
412                 start = false;
413                 skipping = true;
414             } else if (c == KEYWORD_SEPARATOR) {
415                 if (haveKeywordAssign()) {
416                     break;
417                 }
418                 skipping = false;
419                 start = false;
420                 needSeparator = true; // add another underscore if we have more text
421             } else if (start) {
422                 start = false;
423                 if (c != UNDERSCORE && c != HYPHEN) {
424                     index--;
425                 }
426             } else if (!skipping) {
427                 if (needSeparator) {
428                     needSeparator = false;
429                     if (firstPass && !hadCountry) { // no country, we'll need two
430                         addSeparator();
431                         ++oldBlen; // for sure
432                     }
433                     addSeparator();
434                     if (firstPass) { // only for the first separator
435                         ++oldBlen;
436                         firstPass = false;
437                     }
438                 }
439                 c = AsciiUtil.toUpper(c);
440                 if (c == HYPHEN || c == COMMA) {
441                     c = UNDERSCORE;
442                 }
443                 append(c);
444                 if (buffer.length() - oldBlen > MAX_VARIANTS_LENGTH) {
445                     throw new IllegalArgumentException("variants is too long");
446                 }
447             }
448         }
449         --index; // unget
450         return oldBlen;
451     }
452 
453     // no need for skipvariant, to get the keywords we'll just scan directly for
454     // the keyword separator
455 
456     /**
457      * Returns the normalized language id, or the empty string.
458      */
getLanguage()459     public String getLanguage() {
460         reset();
461         return getString(parseLanguage());
462     }
463 
464     /**
465      * Returns the normalized script id, or the empty string.
466      */
getScript()467     public String getScript() {
468         reset();
469         skipLanguage();
470         return getString(parseScript());
471     }
472 
473     /**
474      * return the normalized country id, or the empty string.
475      */
getCountry()476     public String getCountry() {
477         reset();
478         skipLanguage();
479         skipScript();
480         return getString(parseCountry());
481     }
482 
483     /**
484      * Returns the normalized variant id, or the empty string.
485      */
getVariant()486     public String getVariant() {
487         reset();
488         skipLanguage();
489         skipScript();
490         skipCountry();
491         return getString(parseVariant());
492     }
493 
494     /**
495      * Returns the language, script, country, and variant as separate strings.
496      */
getLanguageScriptCountryVariant()497     public String[] getLanguageScriptCountryVariant() {
498         reset();
499         return new String[] {
500                 getString(parseLanguage()),
501                 getString(parseScript()),
502                 getString(parseCountry()),
503                 getString(parseVariant())
504         };
505     }
506 
setBaseName(String baseName)507     public void setBaseName(String baseName) {
508         this.baseName = baseName;
509     }
510 
parseBaseName()511     public void parseBaseName() {
512         if (baseName != null) {
513             set(0, baseName);
514         } else {
515             reset();
516             parseLanguage();
517             parseScript();
518             parseCountry();
519             parseVariant();
520 
521             // catch unwanted trailing underscore after country if there was no variant
522             int len = buffer.length();
523             if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
524                 buffer.deleteCharAt(len - 1);
525             }
526         }
527     }
528 
529     /**
530      * Returns the normalized base form of the locale id.  The base
531      * form does not include keywords.
532      */
getBaseName()533     public String getBaseName() {
534         if (baseName != null) {
535             return baseName;
536         }
537         parseBaseName();
538         return getString(0);
539     }
540 
541     /**
542      * Returns the normalized full form of the locale id.  The full
543      * form includes keywords if they are present.
544      */
getName()545     public String getName() {
546         parseBaseName();
547         parseKeywords();
548         return getString(0);
549     }
550 
551     // keyword utilities
552 
553     /**
554      * If we have keywords, advance index to the start of the keywords and return true,
555      * otherwise return false.
556      */
setToKeywordStart()557     private boolean setToKeywordStart() {
558         for (int i = index; i < id.length; ++i) {
559             if (id[i] == KEYWORD_SEPARATOR) {
560                 if (canonicalize) {
561                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
562                         if (id[j] == KEYWORD_ASSIGN) {
563                             index = i;
564                             return true;
565                         }
566                     }
567                 } else {
568                     if (++i < id.length) {
569                         index = i;
570                         return true;
571                     }
572                 }
573                 break;
574             }
575         }
576         return false;
577     }
578 
isDoneOrKeywordAssign(char c)579     private static boolean isDoneOrKeywordAssign(char c) {
580         return c == DONE || c == KEYWORD_ASSIGN;
581     }
582 
isDoneOrItemSeparator(char c)583     private static boolean isDoneOrItemSeparator(char c) {
584         return c == DONE || c == ITEM_SEPARATOR;
585     }
586 
getKeyword()587     private String getKeyword() {
588         int start = index;
589         while (!isDoneOrKeywordAssign(next())) {
590         }
591         --index;
592         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
593     }
594 
getValue()595     private String getValue() {
596         int start = index;
597         while (!isDoneOrItemSeparator(next())) {
598         }
599         --index;
600         return new String(id, start, index-start).trim(); // leave case alone
601     }
602 
getKeyComparator()603     private Comparator<String> getKeyComparator() {
604         final Comparator<String> comp = new Comparator<String>() {
605             @Override
606             public int compare(String lhs, String rhs) {
607                 return lhs.compareTo(rhs);
608             }
609         };
610         return comp;
611     }
612 
613     /**
614      * Returns a map of the keywords and values, or null if there are none.
615      */
getKeywordMap()616     public Map<String, String> getKeywordMap() {
617         if (keywords == null) {
618             TreeMap<String, String> m = null;
619             if (setToKeywordStart()) {
620                 // trim spaces and convert to lower case, both keywords and values.
621                 do {
622                     String key = getKeyword();
623                     if (key.length() == 0) {
624                         break;
625                     }
626                     char c = next();
627                     if (c != KEYWORD_ASSIGN) {
628                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
629                         if (c == DONE) {
630                             break;
631                         } else {
632                             continue;
633                         }
634                     }
635                     String value = getValue();
636                     if (value.length() == 0) {
637                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
638                         continue;
639                     }
640                     if (m == null) {
641                         m = new TreeMap<String, String>(getKeyComparator());
642                     } else if (m.containsKey(key)) {
643                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
644                         continue;
645                     }
646                     m.put(key, value);
647                 } while (next() == ITEM_SEPARATOR);
648             }
649             keywords = m != null ? m : Collections.<String, String>emptyMap();
650         }
651 
652         return keywords;
653     }
654 
655 
656     /**
657      * Parse the keywords and return start of the string in the buffer.
658      */
parseKeywords()659     private int parseKeywords() {
660         int oldBlen = buffer.length();
661         Map<String, String> m = getKeywordMap();
662         if (!m.isEmpty()) {
663             boolean first = true;
664             for (Map.Entry<String, String> e : m.entrySet()) {
665                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
666                 first = false;
667                 append(e.getKey());
668                 append(KEYWORD_ASSIGN);
669                 append(e.getValue());
670             }
671             if (first == false) {
672                 ++oldBlen;
673             }
674         }
675         return oldBlen;
676     }
677 
678     /**
679      * Returns an iterator over the keywords, or null if we have an empty map.
680      */
getKeywords()681     public Iterator<String> getKeywords() {
682         Map<String, String> m = getKeywordMap();
683         return m.isEmpty() ? null : m.keySet().iterator();
684     }
685 
686     /**
687      * Returns the value for the named keyword, or null if the keyword is not
688      * present.
689      */
getKeywordValue(String keywordName)690     public String getKeywordValue(String keywordName) {
691         Map<String, String> m = getKeywordMap();
692         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
693     }
694 
695     /**
696      * Set the keyword value only if it is not already set to something else.
697      */
defaultKeywordValue(String keywordName, String value)698     public void defaultKeywordValue(String keywordName, String value) {
699         setKeywordValue(keywordName, value, false);
700     }
701 
702     /**
703      * Set the value for the named keyword, or unset it if value is null.  If
704      * keywordName itself is null, unset all keywords.  If keywordName is not null,
705      * value must not be null.
706      */
setKeywordValue(String keywordName, String value)707     public void setKeywordValue(String keywordName, String value) {
708         setKeywordValue(keywordName, value, true);
709     }
710 
711     /**
712      * Set the value for the named keyword, or unset it if value is null.  If
713      * keywordName itself is null, unset all keywords.  If keywordName is not null,
714      * value must not be null.  If reset is true, ignore any previous value for
715      * the keyword, otherwise do not change the keyword (including removal of
716      * one or all keywords).
717      */
setKeywordValue(String keywordName, String value, boolean reset)718     private void setKeywordValue(String keywordName, String value, boolean reset) {
719         if (keywordName == null) {
720             if (reset) {
721                 // force new map, ignore value
722                 keywords = Collections.<String, String>emptyMap();
723             }
724         } else {
725             keywordName = AsciiUtil.toLowerString(keywordName.trim());
726             if (keywordName.length() == 0) {
727                 throw new IllegalArgumentException("keyword must not be empty");
728             }
729             if (value != null) {
730                 value = value.trim();
731                 if (value.length() == 0) {
732                     throw new IllegalArgumentException("value must not be empty");
733                 }
734             }
735             Map<String, String> m = getKeywordMap();
736             if (m.isEmpty()) { // it is EMPTY_MAP
737                 if (value != null) {
738                     // force new map
739                     keywords = new TreeMap<String, String>(getKeyComparator());
740                     keywords.put(keywordName, value.trim());
741                 }
742             } else {
743                 if (reset || !m.containsKey(keywordName)) {
744                     if (value != null) {
745                         m.put(keywordName, value);
746                     } else {
747                         m.remove(keywordName);
748                         if (m.isEmpty()) {
749                             // force new map
750                             keywords = Collections.<String, String>emptyMap();
751                         }
752                     }
753                 }
754             }
755         }
756     }
757 }
758