• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 2003-2011, International Business Machines Corporation and   *
6 * others. All Rights Reserved.                                               *
7 ******************************************************************************
8 */
9 
10 package com.ibm.icu.impl;
11 
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.TreeMap;
17 
18 import com.ibm.icu.impl.locale.AsciiUtil;
19 
20 /**
21  * Utility class to parse and normalize locale ids (including POSIX style)
22  */
23 public final class LocaleIDParser {
24 
    /**
     * Char array representing the locale ID being parsed.
     */
    private char[] id;

    /**
     * Current position in {@link #id} (while parsing).
     */
    private int index;

    /**
     * Temporary buffer for parsed sections of data.
     */
    private StringBuilder buffer;

    // Set from the constructor.  When true, the keyword lookup only accepts an '@'
    // that is followed by an '=' somewhere later in the id; when false, any '@'
    // with text after it is taken as the start of the keywords.
    private boolean canonicalize;
    // Set while parsing the country and read while parsing the variant, which
    // emits two separators instead of one when no country was found.
    private boolean hadCountry;

    // used when canonicalizing
    // keywords: built lazily on first keyword access; an empty map means "no keywords".
    Map<String, String> keywords;
    // baseName: when non-null, used in place of the base name parsed from the id.
    String baseName;

    /**
     * Parsing constants.
     */
    private static final char KEYWORD_SEPARATOR     = '@';
    private static final char HYPHEN                = '-';
    private static final char KEYWORD_ASSIGN        = '=';
    private static final char COMMA                 = ',';
    private static final char ITEM_SEPARATOR        = ';';
    private static final char DOT                   = '.';
    private static final char UNDERSCORE            = '_';
58 
    /**
     * Creates a parser for the given locale id with the canonicalize flag set to false.
     *
     * @param localeID the locale id to parse
     */
    public LocaleIDParser(String localeID) {
        this(localeID, false);
    }
62 
LocaleIDParser(String localeID, boolean canonicalize)63     public LocaleIDParser(String localeID, boolean canonicalize) {
64         id = localeID.toCharArray();
65         index = 0;
66         buffer = new StringBuilder(id.length + 5);
67         this.canonicalize = canonicalize;
68     }
69 
reset()70     private void reset() {
71         index = 0;
72         buffer = new StringBuilder(id.length + 5);
73     }
74 
75     // utilities for working on text in the buffer
76 
    /**
     * Append c to the buffer.
     *
     * @param c the character to add at the end of the buffer
     */
    private void append(char c) {
        buffer.append(c);
    }
83 
addSeparator()84     private void addSeparator() {
85         append(UNDERSCORE);
86     }
87 
88     /**
89      * Returns the text in the buffer from start to blen as a String.
90      */
getString(int start)91     private String getString(int start) {
92         return buffer.substring(start);
93     }
94 
95     /**
96      * Set the length of the buffer to pos, then append the string.
97      */
set(int pos, String s)98     private void set(int pos, String s) {
99         buffer.delete(pos, buffer.length());
100         buffer.insert(pos, s);
101     }
102 
    /**
     * Append the string to the buffer.
     *
     * @param s the text to add at the end of the buffer
     */
    private void append(String s) {
        buffer.append(s);
    }
109 
110     // utilities for parsing text out of the id
111 
    /**
     * Character to indicate no more text is available in the id.  It is returned by
     * {@link #next()} at the end of the id and is also treated as a terminator.
     */
    private static final char DONE = '\uffff';
116 
117     /**
118      * Returns the character at index in the id, and advance index.  The returned character
119      * is DONE if index was at the limit of the buffer.  The index is advanced regardless
120      * so that decrementing the index will always 'unget' the last character returned.
121      */
next()122     private char next() {
123         if (index == id.length) {
124             index++;
125             return DONE;
126         }
127 
128         return id[index++];
129     }
130 
131     /**
132      * Advance index until the next terminator or id separator, and leave it there.
133      */
skipUntilTerminatorOrIDSeparator()134     private void skipUntilTerminatorOrIDSeparator() {
135         while (!isTerminatorOrIDSeparator(next()));
136         --index;
137     }
138 
139     /**
140      * Returns true if the character at index in the id is a terminator.
141      */
atTerminator()142     private boolean atTerminator() {
143         return index >= id.length || isTerminator(id[index]);
144     }
145 
146     /**
147      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
148      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
149      */
isTerminator(char c)150     private boolean isTerminator(char c) {
151         // always terminate at DOT, even if not handling POSIX.  It's an error...
152         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
153     }
154 
155     /**
156      * Returns true if the character is a terminator or id separator.
157      */
isTerminatorOrIDSeparator(char c)158     private boolean isTerminatorOrIDSeparator(char c) {
159         return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
160     }
161 
162     /**
163      * Returns true if the start of the buffer has an experimental or private language
164      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
165      */
haveExperimentalLanguagePrefix()166     private boolean haveExperimentalLanguagePrefix() {
167         if (id.length > 2) {
168             char c = id[1];
169             if (c == HYPHEN || c == UNDERSCORE) {
170                 c = id[0];
171                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
172             }
173         }
174         return false;
175     }
176 
177     /**
178      * Returns true if a value separator occurs at or after index.
179      */
haveKeywordAssign()180     private boolean haveKeywordAssign() {
181         // assume it is safe to start from index
182         for (int i = index; i < id.length; ++i) {
183             if (id[i] == KEYWORD_ASSIGN) {
184                 return true;
185             }
186         }
187         return false;
188     }
189 
190     /**
191      * Advance index past language, and accumulate normalized language code in buffer.
192      * Index must be at 0 when this is called.  Index is left at a terminator or id
193      * separator.  Returns the start of the language code in the buffer.
194      */
parseLanguage()195     private int parseLanguage() {
196         int startLength = buffer.length();
197 
198         if (haveExperimentalLanguagePrefix()) {
199             append(AsciiUtil.toLower(id[0]));
200             append(HYPHEN);
201             index = 2;
202         }
203 
204         char c;
205         while(!isTerminatorOrIDSeparator(c = next())) {
206             append(AsciiUtil.toLower(c));
207         }
208         --index; // unget
209 
210         if (buffer.length() - startLength == 3) {
211             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
212             if (lang != null) {
213                 set(0, lang);
214             }
215         }
216 
217         return 0;
218     }
219 
220     /**
221      * Advance index past language.  Index must be at 0 when this is called.  Index
222      * is left at a terminator or id separator.
223      */
skipLanguage()224     private void skipLanguage() {
225         if (haveExperimentalLanguagePrefix()) {
226             index = 2;
227         }
228         skipUntilTerminatorOrIDSeparator();
229     }
230 
231     /**
232      * Advance index past script, and accumulate normalized script in buffer.
233      * Index must be immediately after the language.
234      * If the item at this position is not a script (is not four characters
235      * long) leave index and buffer unchanged.  Otherwise index is left at
236      * a terminator or id separator.  Returns the start of the script code
237      * in the buffer (this may be equal to the buffer length, if there is no
238      * script).
239      */
parseScript()240     private int parseScript() {
241         if (!atTerminator()) {
242             int oldIndex = index; // save original index
243             ++index;
244 
245             int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
246             char c;
247             boolean firstPass = true;
248             while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
249                 if (firstPass) {
250                     addSeparator();
251                     append(AsciiUtil.toUpper(c));
252                     firstPass = false;
253                 } else {
254                     append(AsciiUtil.toLower(c));
255                 }
256             }
257             --index; // unget
258 
259             /* If it's not exactly 4 characters long, then it's not a script. */
260             if (index - oldIndex != 5) { // +1 to account for separator
261                 index = oldIndex;
262                 buffer.delete(oldBlen, buffer.length());
263             } else {
264                 oldBlen++; // index past hyphen, for clients who want to extract just the script
265             }
266 
267             return oldBlen;
268         }
269         return buffer.length();
270     }
271 
272     /**
273      * Advance index past script.
274      * Index must be immediately after the language and IDSeparator.
275      * If the item at this position is not a script (is not four characters
276      * long) leave index.  Otherwise index is left at a terminator or
277      * id separator.
278      */
skipScript()279     private void skipScript() {
280         if (!atTerminator()) {
281             int oldIndex = index;
282             ++index;
283 
284             char c;
285             while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
286             --index;
287 
288             if (index - oldIndex != 5) { // +1 to account for separator
289                 index = oldIndex;
290             }
291         }
292     }
293 
294     /**
295      * Advance index past country, and accumulate normalized country in buffer.
296      * Index must be immediately after the script (if there is one, else language)
297      * and IDSeparator.  Return the start of the country code in the buffer.
298      */
parseCountry()299     private int parseCountry() {
300         if (!atTerminator()) {
301             int oldIndex = index;
302             ++index;
303 
304             int oldBlen = buffer.length();
305             char c;
306             boolean firstPass = true;
307             while (!isTerminatorOrIDSeparator(c = next())) {
308                 if (firstPass) { // first, add hyphen
309                     hadCountry = true; // we have a country, let variant parsing know
310                     addSeparator();
311                     ++oldBlen; // increment past hyphen
312                     firstPass = false;
313                 }
314                 append(AsciiUtil.toUpper(c));
315             }
316             --index; // unget
317 
318             int charsAppended = buffer.length() - oldBlen;
319 
320             if (charsAppended == 0) {
321                 // Do nothing.
322             }
323             else if (charsAppended < 2 || charsAppended > 3) {
324                 // It's not a country, so return index and blen to
325                 // their previous values.
326                 index = oldIndex;
327                 --oldBlen;
328                 buffer.delete(oldBlen, buffer.length());
329                 hadCountry = false;
330             }
331             else if (charsAppended == 3) {
332                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
333                 if (region != null) {
334                     set(oldBlen, region);
335                 }
336             }
337 
338             return oldBlen;
339         }
340 
341         return buffer.length();
342     }
343 
344     /**
345      * Advance index past country.
346      * Index must be immediately after the script (if there is one, else language)
347      * and IDSeparator.
348      */
skipCountry()349     private void skipCountry() {
350         if (!atTerminator()) {
351             if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
352                 ++index;
353             }
354             /*
355              * Save the index point after the separator, since the format
356              * requires two separators if the country is not present.
357              */
358             int oldIndex = index;
359 
360             skipUntilTerminatorOrIDSeparator();
361             int charsSkipped = index - oldIndex;
362             if (charsSkipped < 2 || charsSkipped > 3) {
363                 index = oldIndex;
364             }
365         }
366     }
367 
368     /**
369      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
370      * the codepage information from POSIX ids.  Index must be immediately after the country
371      * or script.  Index is left at the keyword separator or at the end of the text.  Return
372      * the start of the variant code in the buffer.
373      *
374      * In standard form, we can have the following forms:
375      * ll__VVVV
376      * ll_CC_VVVV
377      * ll_Ssss_VVVV
378      * ll_Ssss_CC_VVVV
379      *
380      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
381      * ll_CC.pppp          --> ll_CC
382      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
383      * ll_CC@VVVV          --> ll_CC_VVVV
384      *
385      * We identify this use of '@' in POSIX ids by looking for an '=' following
386      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
387      * being part of a POSIX id.
388      *
389      * Note:  since it was decided that we want an option to not handle POSIX ids, this
390      * becomes a bit more complex.
391      */
parseVariant()392     private int parseVariant() {
393         int oldBlen = buffer.length();
394 
395         boolean start = true;
396         boolean needSeparator = true;
397         boolean skipping = false;
398         char c;
399         boolean firstPass = true;
400 
401         while ((c = next()) != DONE) {
402             if (c == DOT) {
403                 start = false;
404                 skipping = true;
405             } else if (c == KEYWORD_SEPARATOR) {
406                 if (haveKeywordAssign()) {
407                     break;
408                 }
409                 skipping = false;
410                 start = false;
411                 needSeparator = true; // add another underscore if we have more text
412             } else if (start) {
413                 start = false;
414                 if (c != UNDERSCORE && c != HYPHEN) {
415                     index--;
416                 }
417             } else if (!skipping) {
418                 if (needSeparator) {
419                     needSeparator = false;
420                     if (firstPass && !hadCountry) { // no country, we'll need two
421                         addSeparator();
422                         ++oldBlen; // for sure
423                     }
424                     addSeparator();
425                     if (firstPass) { // only for the first separator
426                         ++oldBlen;
427                         firstPass = false;
428                     }
429                 }
430                 c = AsciiUtil.toUpper(c);
431                 if (c == HYPHEN || c == COMMA) {
432                     c = UNDERSCORE;
433                 }
434                 append(c);
435             }
436         }
437         --index; // unget
438 
439         return oldBlen;
440     }
441 
442     // no need for skipvariant, to get the keywords we'll just scan directly for
443     // the keyword separator
444 
445     /**
446      * Returns the normalized language id, or the empty string.
447      */
getLanguage()448     public String getLanguage() {
449         reset();
450         return getString(parseLanguage());
451     }
452 
453     /**
454      * Returns the normalized script id, or the empty string.
455      */
getScript()456     public String getScript() {
457         reset();
458         skipLanguage();
459         return getString(parseScript());
460     }
461 
462     /**
463      * return the normalized country id, or the empty string.
464      */
getCountry()465     public String getCountry() {
466         reset();
467         skipLanguage();
468         skipScript();
469         return getString(parseCountry());
470     }
471 
472     /**
473      * Returns the normalized variant id, or the empty string.
474      */
getVariant()475     public String getVariant() {
476         reset();
477         skipLanguage();
478         skipScript();
479         skipCountry();
480         return getString(parseVariant());
481     }
482 
483     /**
484      * Returns the language, script, country, and variant as separate strings.
485      */
getLanguageScriptCountryVariant()486     public String[] getLanguageScriptCountryVariant() {
487         reset();
488         return new String[] {
489                 getString(parseLanguage()),
490                 getString(parseScript()),
491                 getString(parseCountry()),
492                 getString(parseVariant())
493         };
494     }
495 
    /**
     * Sets an explicit base name.  While it is non-null it is used in place of the
     * base name parsed from the id.
     *
     * @param baseName the base name to use, or null to go back to parsing the id
     */
    public void setBaseName(String baseName) {
        this.baseName = baseName;
    }
499 
parseBaseName()500     public void parseBaseName() {
501         if (baseName != null) {
502             set(0, baseName);
503         } else {
504             reset();
505             parseLanguage();
506             parseScript();
507             parseCountry();
508             parseVariant();
509 
510             // catch unwanted trailing underscore after country if there was no variant
511             int len = buffer.length();
512             if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
513                 buffer.deleteCharAt(len - 1);
514             }
515         }
516     }
517 
518     /**
519      * Returns the normalized base form of the locale id.  The base
520      * form does not include keywords.
521      */
getBaseName()522     public String getBaseName() {
523         if (baseName != null) {
524             return baseName;
525         }
526         parseBaseName();
527         return getString(0);
528     }
529 
530     /**
531      * Returns the normalized full form of the locale id.  The full
532      * form includes keywords if they are present.
533      */
getName()534     public String getName() {
535         parseBaseName();
536         parseKeywords();
537         return getString(0);
538     }
539 
540     // keyword utilities
541 
542     /**
543      * If we have keywords, advance index to the start of the keywords and return true,
544      * otherwise return false.
545      */
setToKeywordStart()546     private boolean setToKeywordStart() {
547         for (int i = index; i < id.length; ++i) {
548             if (id[i] == KEYWORD_SEPARATOR) {
549                 if (canonicalize) {
550                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
551                         if (id[j] == KEYWORD_ASSIGN) {
552                             index = i;
553                             return true;
554                         }
555                     }
556                 } else {
557                     if (++i < id.length) {
558                         index = i;
559                         return true;
560                     }
561                 }
562                 break;
563             }
564         }
565         return false;
566     }
567 
isDoneOrKeywordAssign(char c)568     private static boolean isDoneOrKeywordAssign(char c) {
569         return c == DONE || c == KEYWORD_ASSIGN;
570     }
571 
isDoneOrItemSeparator(char c)572     private static boolean isDoneOrItemSeparator(char c) {
573         return c == DONE || c == ITEM_SEPARATOR;
574     }
575 
getKeyword()576     private String getKeyword() {
577         int start = index;
578         while (!isDoneOrKeywordAssign(next())) {
579         }
580         --index;
581         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
582     }
583 
getValue()584     private String getValue() {
585         int start = index;
586         while (!isDoneOrItemSeparator(next())) {
587         }
588         --index;
589         return new String(id, start, index-start).trim(); // leave case alone
590     }
591 
getKeyComparator()592     private Comparator<String> getKeyComparator() {
593         final Comparator<String> comp = new Comparator<String>() {
594             @Override
595             public int compare(String lhs, String rhs) {
596                 return lhs.compareTo(rhs);
597             }
598         };
599         return comp;
600     }
601 
602     /**
603      * Returns a map of the keywords and values, or null if there are none.
604      */
getKeywordMap()605     public Map<String, String> getKeywordMap() {
606         if (keywords == null) {
607             TreeMap<String, String> m = null;
608             if (setToKeywordStart()) {
609                 // trim spaces and convert to lower case, both keywords and values.
610                 do {
611                     String key = getKeyword();
612                     if (key.length() == 0) {
613                         break;
614                     }
615                     char c = next();
616                     if (c != KEYWORD_ASSIGN) {
617                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
618                         if (c == DONE) {
619                             break;
620                         } else {
621                             continue;
622                         }
623                     }
624                     String value = getValue();
625                     if (value.length() == 0) {
626                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
627                         continue;
628                     }
629                     if (m == null) {
630                         m = new TreeMap<String, String>(getKeyComparator());
631                     } else if (m.containsKey(key)) {
632                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
633                         continue;
634                     }
635                     m.put(key, value);
636                 } while (next() == ITEM_SEPARATOR);
637             }
638             keywords = m != null ? m : Collections.<String, String>emptyMap();
639         }
640 
641         return keywords;
642     }
643 
644 
645     /**
646      * Parse the keywords and return start of the string in the buffer.
647      */
parseKeywords()648     private int parseKeywords() {
649         int oldBlen = buffer.length();
650         Map<String, String> m = getKeywordMap();
651         if (!m.isEmpty()) {
652             boolean first = true;
653             for (Map.Entry<String, String> e : m.entrySet()) {
654                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
655                 first = false;
656                 append(e.getKey());
657                 append(KEYWORD_ASSIGN);
658                 append(e.getValue());
659             }
660             if (first == false) {
661                 ++oldBlen;
662             }
663         }
664         return oldBlen;
665     }
666 
667     /**
668      * Returns an iterator over the keywords, or null if we have an empty map.
669      */
getKeywords()670     public Iterator<String> getKeywords() {
671         Map<String, String> m = getKeywordMap();
672         return m.isEmpty() ? null : m.keySet().iterator();
673     }
674 
675     /**
676      * Returns the value for the named keyword, or null if the keyword is not
677      * present.
678      */
getKeywordValue(String keywordName)679     public String getKeywordValue(String keywordName) {
680         Map<String, String> m = getKeywordMap();
681         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
682     }
683 
    /**
     * Set the keyword value only if it is not already set to something else.
     *
     * @param keywordName the keyword to give a default to
     * @param value the value to use when the keyword has no value yet
     */
    public void defaultKeywordValue(String keywordName, String value) {
        setKeywordValue(keywordName, value, false);
    }
690 
    /**
     * Set the value for the named keyword, or unset it if value is null.  If
     * keywordName itself is null, unset all keywords (value is then ignored).
     * Any previous value for the keyword is replaced.
     *
     * @param keywordName the keyword to change, or null to remove all keywords
     * @param value the new value, or null to remove the keyword
     * @throws IllegalArgumentException if keywordName, or a non-null value, is
     *         empty after trimming
     */
    public void setKeywordValue(String keywordName, String value) {
        setKeywordValue(keywordName, value, true);
    }
699 
700     /**
701      * Set the value for the named keyword, or unset it if value is null.  If
702      * keywordName itself is null, unset all keywords.  If keywordName is not null,
703      * value must not be null.  If reset is true, ignore any previous value for
704      * the keyword, otherwise do not change the keyword (including removal of
705      * one or all keywords).
706      */
setKeywordValue(String keywordName, String value, boolean reset)707     private void setKeywordValue(String keywordName, String value, boolean reset) {
708         if (keywordName == null) {
709             if (reset) {
710                 // force new map, ignore value
711                 keywords = Collections.<String, String>emptyMap();
712             }
713         } else {
714             keywordName = AsciiUtil.toLowerString(keywordName.trim());
715             if (keywordName.length() == 0) {
716                 throw new IllegalArgumentException("keyword must not be empty");
717             }
718             if (value != null) {
719                 value = value.trim();
720                 if (value.length() == 0) {
721                     throw new IllegalArgumentException("value must not be empty");
722                 }
723             }
724             Map<String, String> m = getKeywordMap();
725             if (m.isEmpty()) { // it is EMPTY_MAP
726                 if (value != null) {
727                     // force new map
728                     keywords = new TreeMap<String, String>(getKeyComparator());
729                     keywords.put(keywordName, value.trim());
730                 }
731             } else {
732                 if (reset || !m.containsKey(keywordName)) {
733                     if (value != null) {
734                         m.put(keywordName, value);
735                     } else {
736                         m.remove(keywordName);
737                         if (m.isEmpty()) {
738                             // force new map
739                             keywords = Collections.<String, String>emptyMap();
740                         }
741                     }
742                 }
743             }
744         }
745     }
746 }
747