• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 1997-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *
7 * File ULOC.CPP
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   04/01/97    aliu        Creation.
13 *   08/21/98    stephen     JDK 1.2 sync
14 *   12/08/98    rtg         New Locale implementation and C API
15 *   03/15/99    damiba      overhaul.
16 *   04/06/99    stephen     changed setDefault() to realloc and copy
17 *   06/14/99    stephen     Changed calls to ures_open for new params
18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
20 *                           brought canonicalization code into line with spec
21 *****************************************************************************/
22 
23 /*
24    POSIX's locale format, from putil.c: [no spaces]
25 
26      ll [ _CC ] [ . MM ] [ @ VV]
27 
28      l = lang, C = ctry, M = charmap, V = variant
29 */
30 
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/uloc.h"
34 
35 #include "putilimp.h"
36 #include "ustr_imp.h"
37 #include "ulocimp.h"
38 #include "umutex.h"
39 #include "cstring.h"
40 #include "cmemory.h"
41 #include "ucln_cmn.h"
42 #include "locmap.h"
43 #include "uarrsort.h"
44 #include "uenumimp.h"
45 #include "uassert.h"
46 
47 #include <stdio.h> /* for sprintf */
48 
49 /* ### Declarations **************************************************/
50 
51 /* Locale stuff from locid.cpp */
52 U_CFUNC void locale_set_default(const char *id);
53 U_CFUNC const char *locale_get_default(void);
54 U_CFUNC int32_t
55 locale_getKeywords(const char *localeID,
56             char prev,
57             char *keywords, int32_t keywordCapacity,
58             char *values, int32_t valuesCapacity, int32_t *valLen,
59             UBool valuesToo,
60             UErrorCode *status);
61 
62 /* ### Data tables **************************************************/
63 
/**
 * Table of language codes, both 2- and 3-letter, with preference
 * given to 2-letter codes where possible.  Includes 3-letter codes
 * that lack a 2-letter equivalent.
 *
 * Layout: a primary list, a NULL entry, a second list, and a final
 * NULL entry.  The first list is visible to user code when this array
 * is returned by API.  The second list contains codes we support, but
 * do not expose through user API.
 *
 * The primary list must be in sorted order, and the whole table must
 * be kept in sync with LANGUAGES_3, with corresponding entries
 * matched index for index.
 *
 * Notes
 *
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
 * include the revisions up to 2001/7/27 *CWB*
 *
 * The 3 character codes are the terminology codes like RFC 3066.  This
 * is compatible with prior ICU codes.
 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are kept, but
 * in the second list at the end of the table, because their 3
 * character codes duplicate entries of the primary list.  This avoids
 * bad searches going from 3 to 2 character codes.
 *
 * The range qaa-qtz is reserved for local use.
 */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
    "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
    "ang", "anp", "apa",
    "ar",  "arc", "arn", "arp", "art", "arw", "as", "asa", "ast",
    "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
    "bai", "bal", "ban", "bas", "bat", "be",  "bej",
    "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",
    "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",
    "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
    "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",
    "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
    "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
    "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",
    "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",
    "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",
    "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
    "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
    "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
    "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
    "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
    "grc", "gsw", "gu",  "guz", "gv",  "gwi",
    "ha",  "hai", "haw", "he",  "hi",  "hil", "him",
    "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
    "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
    "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
    "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",
    "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",
    "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks", "ksb", "ksf",
    "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",
    "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
    "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",
    "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
    "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",
    "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
    "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",
    "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap", "naq",
    "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
    "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",
    "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
    "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
    "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
    "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",
    "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",
    "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",
    "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",
    "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
    "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
    "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",
    "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
    "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",
    "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw", "twq",
    "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
    "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",
    "wal", "war", "was", "wen", "wo",  "xal", "xh",  "xog", "yao", "yap", "yav",
    "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",
    "zu",  "zun", "zxx", "zza",
NULL,
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL
};
/*
 * Deprecated 2-letter language codes and their replacements.
 *
 * The two arrays are parallel: DEPRECATED_LANGUAGES[i] corresponds to
 * REPLACEMENT_LANGUAGES[i].  Each array ends with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
/*  "in", "iw", "ji", "jw" */
    "id", "he", "yi", "jv", NULL, NULL
};
165 
/**
 * Table of 3-letter language codes.
 *
 * This is a lookup table used to convert 3-letter language codes to
 * their 2-letter equivalent, where possible.  It must be kept in sync
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i].  The commented-out lines are
 * copied from LANGUAGES to make eyeballing this baby easier.
 *
 * Where a 3-letter language code has no 2-letter equivalent, the
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 * Every entry in this table is therefore exactly three letters long;
 * a 2-letter code here is a data error (the "kg" slot must hold
 * "kon", the ISO 639-2 code for Kongo).
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The two lists correspond to
 * the two lists in LANGUAGES.
 */
static const char * const LANGUAGES_3[] = {
/*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
/*  "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
    "afh", "agq", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
/*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "asa", "ast",    */
    "ara", "arc", "arn", "arp", "art", "arw", "asm", "asa", "ast",
/*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
    "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
/*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
    "bai", "bal", "ban", "bas", "bat", "bel", "bej",
/*  "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
    "bem", "ber", "bez", "bul", "bih", "bho", "bis", "bik", "bin",
/*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",     */
    "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "brx", "bos",
/*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
    "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
/*  "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",    */
    "cch", "che", "ceb", "cel", "cgg", "cha", "chb", "chg", "chk", "chm",
/*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
    "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
/*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
    "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
/*  "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",    */
    "chv", "cym", "dan", "dak", "dar", "dav", "day", "deu", "del", "den",
/*  "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",    */
    "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "div", "dyo", "dyu",
/*  "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
    "dzo", "ebu", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
/*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
    "enm", "epo", "spa", "est", "eus", "ewo", "fas",
/*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
    "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
/*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
    "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
/*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
    "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
/*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guz", "gv",     */
    "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guz", "glv",
/*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
    "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
/*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
    "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
/*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
    "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
/*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
    "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
/*  "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
    "iku", "jpn", "jbo", "jmc", "jpr", "jrb", "jav", "kat", "kaa", "kab",
/*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",*/
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kon", "kha", "khi",
/*  "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",     */
    "kho", "khq", "kik", "kua", "kaz", "kal", "kln", "khm", "kmb", "kan",
/*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf", */
    "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas", "ksb", "ksf",
/*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",    */
    "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad", "lag",
/*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
    "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
/*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",   */
    "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus", "luy",
/*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
    "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
/*  "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",    */
    "mdf", "mdr", "men", "mer", "mfe", "mlg", "mga", "mgh", "mah", "mri", "mic", "min",
/*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
    "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
/*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",    */
    "mol", "moh", "mos", "mar", "msa", "mlt", "mua", "mul", "mun",
/*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap", "naq",   */
    "mus", "mwl", "mwr",  "mya", "myn", "myv", "nau", "nah", "nai", "nap", "naq",
/*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
    "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
/*  "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",   */
    "niu", "nld", "nmg", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub", "nus",
/*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
    "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
/*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
    "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
/*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
/*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
    "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
/*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",    */
    "raj", "rap", "rar", "roh", "run", "ron", "roa", "rof", "rom",
/*  "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",    */
    "rus", "rup", "kin", "rwk", "san", "sad", "sah", "sai", "sal", "sam", "saq",
/*  "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",    */
    "sas", "sat", "sbp", "srd", "scn", "sco", "snd", "sme", "seh", "sel", "sem", "ses",
/*  "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",    */
    "sag", "sga", "sgn", "shi", "shn", "sin", "sid", "sio", "sit",
/*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
    "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
/*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
    "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
/*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
    "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
/*  "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",    */
    "swe", "swa", "swc", "syc", "syr", "tam", "tai", "tel", "tem", "teo", "ter",
/*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
    "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
/*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
    "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
/*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",  "twq"   */
    "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi", "twq",
/*  "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
    "tah", "tyv", "tzm", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
/*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",    */
    "uzb", "vai", "ven", "vie", "vol", "vot", "vun", "wln", "wak",
/*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "xog", "yao", "yap", "yav",   */
    "wal", "war", "was", "wen", "wol", "xal", "xho", "xog", "yao", "yap", "yav",
/*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
    "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
/*  "zu",  "zun", "zxx", "zza",                                         */
    "zul", "zun", "zxx", "zza",
NULL,
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL
};
302 
/**
 * Table of 2-letter country codes.
 *
 * Layout: a primary list, a NULL entry, a second list, and a final
 * NULL entry.  The first list is visible to user code when this array
 * is returned by API.  The second list contains codes we support, but
 * do not expose through user API.
 *
 * The primary list must be in sorted order, and the whole table must
 * be kept in sync with COUNTRIES_3, with corresponding entries
 * matched index for index.
 *
 * Notes:
 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 * new codes keeping the old ones for compatibility updated to include
 * 1999/12/03 revisions *CWB*
 *
 * RO(ROM) is now RO(ROU) according to
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
    "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL,
    "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
NULL
};
362 
/*
 * Deprecated 2-letter country codes and their replacements.
 *
 * The two arrays are parallel: DEPRECATED_COUNTRIES[i] corresponds to
 * REPLACEMENT_COUNTRIES[i].  Each array ends with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] ={
    "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
    "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
};
370 
/**
 * Table of 3-letter country codes.
 *
 * This is a lookup table used to convert 3-letter country codes to
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 * For all valid i, COUNTRIES[i] must refer to the same country as
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 * to make eyeballing this baby easier.
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The two lists correspond to
 * the two lists in COUNTRIES.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
/*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
    "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
NULL
};
450 
/**
 * One row of the locale ID canonicalization table: an input ID, the
 * canonical ID it maps to, and an optional keyword/value pair that
 * accompanies the canonical ID.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;

/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations: POSIX and .NET names,
 * registered names, old ICU names, and PREEURO variants (expressed as
 * a "currency" keyword on the canonical ID).
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "cel_GAULISH",    "cel__GAULISH", NULL, NULL }, /* registered name */
    { "de_1901",        "de__1901", NULL, NULL }, /* registered name */
    { "de_1906",        "de__1906", NULL, NULL }, /* registered name */
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BOONT",       "en__BOONT", NULL, NULL }, /* registered name */
    { "en_SCOUSE",      "en__SCOUSE", NULL, NULL }, /* registered name */
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sl_ROZAJ",       "sl__ROZAJ", NULL, NULL }, /* registered name */
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "zh__GAN", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "zh__HAKKA", NULL, NULL }, /* registered name */
    { "zh_MIN",         "zh__MIN", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "zh__MINNAN", NULL, NULL }, /* registered name */
    { "zh_WUU",         "zh__WUU", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "zh__XIANG", NULL, NULL }, /* registered name */
    { "zh_YUE",         "zh__YUE", NULL, NULL }, /* registered name */
};
517 
/**
 * One row of the variant table: a variant name and the keyword/value
 * pair associated with it.
 */
typedef struct VariantMap {
    const char *variant;     /* input variant name */
    const char *keyword;     /* keyword associated with the variant */
    const char *value;       /* value for that keyword */
} VariantMap;

/**
 * Variant names that have a keyword/value equivalent.
 */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
529 
530 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 *
 * Evaluates to true when id is non-NULL, contains no '@', and its
 * shortest separator-terminated subtag (see getShortestSubtagLength)
 * is exactly one character long.
 *
 * The macro now uses its own argument throughout; it previously passed
 * the caller-scope name `localeID` to getShortestSubtagLength, which
 * was only correct when the argument happened to be called localeID.
 * Note that id is evaluated more than once, so it must be free of side
 * effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *
 * On success finalID is set to buffer (which then holds the converted
 * id); if uloc_forLanguageTag returns <= 0 or sets a failure code in
 * *err, finalID is set to the original id instead.
 *
 * This expands to a bare if/else statement, so it must be used as a
 * complete statement inside its own braces, never as the unbraced body
 * of another if.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) \
        if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || U_FAILURE(*(err))) { \
            finalID=(id); \
        } else { \
            finalID=(buffer); \
        }
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are the runs of characters between '_' or '-' separators.
 * Only subtags that are followed by a separator are measured: the
 * final subtag (the text after the last separator, or the whole string
 * when there is no separator) is never compared.  Empty subtags
 * (leading or consecutive separators) are ignored.
 *
 * @param localeID NUL-terminated locale ID; must not be NULL
 * @return the length of the shortest measured subtag, or the length of
 *         the whole string if no subtag was measured
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = -1;  /* -1: no separator-terminated subtag seen yet */
    int32_t current = 0;    /* length of the subtag being scanned */
    int32_t i;

    for (i = 0; localeID[i] != '\0'; i++) {
        if (localeID[i] == '_' || localeID[i] == '-') {
            /* A separator closes the current subtag; empty ones do not count. */
            if (current != 0 && (shortest < 0 || current < shortest)) {
                shortest = current;
            }
            current = 0;
        } else {
            current++;
        }
    }

    /* i is now the full string length, the fallback when nothing was measured. */
    return (shortest < 0) ? i : shortest;
}
565 
566 /* ### Keywords **************************************************/
567 
/* Size, in chars and including the terminating NUL, of a buffer holding one canonical keyword name. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of entries in the keyword list that _getKeywords() builds on its stack. */
#define ULOC_MAX_NO_KEYWORDS 25
570 
571 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)572 locale_getKeywordsStart(const char *localeID) {
573     const char *result = NULL;
574     if((result = uprv_strchr(localeID, '@')) != NULL) {
575         return result;
576     }
577 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
578     else {
579         /* We do this because the @ sign is variant, and the @ sign used on one
580         EBCDIC machine won't be compiled the same way on other EBCDIC based
581         machines. */
582         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
583         const uint8_t *charToFind = ebcdicSigns;
584         while(*charToFind) {
585             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
586                 return result;
587             }
588             charToFind++;
589         }
590     }
591 #endif
592     return NULL;
593 }
594 
595 /**
596  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
597  * @param keywordName incoming name to be canonicalized
598  * @param status return status (keyword too long)
599  * @return length of the keyword name
600  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)601 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
602 {
603   int32_t i;
604   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
605 
606   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
607     /* keyword name too long for internal buffer */
608     *status = U_INTERNAL_PROGRAM_ERROR;
609           return 0;
610   }
611 
612   /* normalize the keyword name */
613   for(i = 0; i < keywordNameLen; i++) {
614     buf[i] = uprv_tolower(keywordName[i]);
615   }
616   buf[i] = 0;
617 
618   return keywordNameLen;
619 }
620 
621 typedef struct {
622     char keyword[ULOC_KEYWORD_BUFFER_LEN];
623     int32_t keywordLen;
624     const char *valueStart;
625     int32_t valueLen;
626 } KeywordStruct;
627 
628 static int32_t U_CALLCONV
compareKeywordStructs(const void * context,const void * left,const void * right)629 compareKeywordStructs(const void *context, const void *left, const void *right) {
630     const char* leftString = ((const KeywordStruct *)left)->keyword;
631     const char* rightString = ((const KeywordStruct *)right)->keyword;
632     return uprv_strcmp(leftString, rightString);
633 }
634 
635 /**
636  * Both addKeyword and addValue must already be in canonical form.
637  * Either both addKeyword and addValue are NULL, or neither is NULL.
638  * If they are not NULL they must be zero terminated.
639  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
640  */
641 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)642 _getKeywords(const char *localeID,
643              char prev,
644              char *keywords, int32_t keywordCapacity,
645              char *values, int32_t valuesCapacity, int32_t *valLen,
646              UBool valuesToo,
647              const char* addKeyword,
648              const char* addValue,
649              UErrorCode *status)
650 {
651     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
652 
653     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
654     int32_t numKeywords = 0;
655     const char* pos = localeID;
656     const char* equalSign = NULL;
657     const char* semicolon = NULL;
658     int32_t i = 0, j, n;
659     int32_t keywordsLen = 0;
660     int32_t valuesLen = 0;
661 
662     if(prev == '@') { /* start of keyword definition */
663         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
664         do {
665             UBool duplicate = FALSE;
666             /* skip leading spaces */
667             while(*pos == ' ') {
668                 pos++;
669             }
670             if (!*pos) { /* handle trailing "; " */
671                 break;
672             }
673             if(numKeywords == maxKeywords) {
674                 *status = U_INTERNAL_PROGRAM_ERROR;
675                 return 0;
676             }
677             equalSign = uprv_strchr(pos, '=');
678             semicolon = uprv_strchr(pos, ';');
679             /* lack of '=' [foo@currency] is illegal */
680             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
681             if(!equalSign || (semicolon && semicolon<equalSign)) {
682                 *status = U_INVALID_FORMAT_ERROR;
683                 return 0;
684             }
685             /* need to normalize both keyword and keyword name */
686             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
687                 /* keyword name too long for internal buffer */
688                 *status = U_INTERNAL_PROGRAM_ERROR;
689                 return 0;
690             }
691             for(i = 0, n = 0; i < equalSign - pos; ++i) {
692                 if (pos[i] != ' ') {
693                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
694                 }
695             }
696             keywordList[numKeywords].keyword[n] = 0;
697             keywordList[numKeywords].keywordLen = n;
698             /* now grab the value part. First we skip the '=' */
699             equalSign++;
700             /* then we leading spaces */
701             while(*equalSign == ' ') {
702                 equalSign++;
703             }
704             keywordList[numKeywords].valueStart = equalSign;
705 
706             pos = semicolon;
707             i = 0;
708             if(pos) {
709                 while(*(pos - i - 1) == ' ') {
710                     i++;
711                 }
712                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
713                 pos++;
714             } else {
715                 i = (int32_t)uprv_strlen(equalSign);
716                 /* BEGIN android-changed
717                    For http://b/issue?id=6008774 : out-of-boundary memory access */
718                 while(i && equalSign[i-1] == ' ') {
719                     i--;
720                 }
721                 /* END android-changed */
722                 keywordList[numKeywords].valueLen = i;
723             }
724             /* If this is a duplicate keyword, then ignore it */
725             for (j=0; j<numKeywords; ++j) {
726                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
727                     duplicate = TRUE;
728                     break;
729                 }
730             }
731             if (!duplicate) {
732                 ++numKeywords;
733             }
734         } while(pos);
735 
736         /* Handle addKeyword/addValue. */
737         if (addKeyword != NULL) {
738             UBool duplicate = FALSE;
739             U_ASSERT(addValue != NULL);
740             /* Search for duplicate; if found, do nothing. Explicit keyword
741                overrides addKeyword. */
742             for (j=0; j<numKeywords; ++j) {
743                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
744                     duplicate = TRUE;
745                     break;
746                 }
747             }
748             if (!duplicate) {
749                 if (numKeywords == maxKeywords) {
750                     *status = U_INTERNAL_PROGRAM_ERROR;
751                     return 0;
752                 }
753                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
754                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
755                 keywordList[numKeywords].valueStart = addValue;
756                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
757                 ++numKeywords;
758             }
759         } else {
760             U_ASSERT(addValue == NULL);
761         }
762 
763         /* now we have a list of keywords */
764         /* we need to sort it */
765         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
766 
767         /* Now construct the keyword part */
768         for(i = 0; i < numKeywords; i++) {
769             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
770                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
771                 if(valuesToo) {
772                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
773                 } else {
774                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
775                 }
776             }
777             keywordsLen += keywordList[i].keywordLen + 1;
778             if(valuesToo) {
779                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
780                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
781                 }
782                 keywordsLen += keywordList[i].valueLen;
783 
784                 if(i < numKeywords - 1) {
785                     if(keywordsLen < keywordCapacity) {
786                         keywords[keywordsLen] = ';';
787                     }
788                     keywordsLen++;
789                 }
790             }
791             if(values) {
792                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
793                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
794                     values[valuesLen + keywordList[i].valueLen] = 0;
795                 }
796                 valuesLen += keywordList[i].valueLen + 1;
797             }
798         }
799         if(values) {
800             values[valuesLen] = 0;
801             if(valLen) {
802                 *valLen = valuesLen;
803             }
804         }
805         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
806     } else {
807         return 0;
808     }
809 }
810 
811 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)812 locale_getKeywords(const char *localeID,
813                    char prev,
814                    char *keywords, int32_t keywordCapacity,
815                    char *values, int32_t valuesCapacity, int32_t *valLen,
816                    UBool valuesToo,
817                    UErrorCode *status) {
818     return _getKeywords(localeID, prev, keywords, keywordCapacity,
819                         values, valuesCapacity, valLen, valuesToo,
820                         NULL, NULL, status);
821 }
822 
823 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)824 uloc_getKeywordValue(const char* localeID,
825                      const char* keywordName,
826                      char* buffer, int32_t bufferCapacity,
827                      UErrorCode* status)
828 {
829     const char* startSearchHere = NULL;
830     const char* nextSeparator = NULL;
831     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
832     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
833     int32_t i = 0;
834     int32_t result = 0;
835 
836     if(status && U_SUCCESS(*status) && localeID) {
837       char tempBuffer[ULOC_FULLNAME_CAPACITY];
838       const char* tmpLocaleID;
839 
840       if (_hasBCP47Extension(localeID)) {
841           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
842       } else {
843           tmpLocaleID=localeID;
844       }
845 
846       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
847       if(startSearchHere == NULL) {
848           /* no keywords, return at once */
849           return 0;
850       }
851 
852       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
853       if(U_FAILURE(*status)) {
854         return 0;
855       }
856 
857       /* find the first keyword */
858       while(startSearchHere) {
859           startSearchHere++;
860           /* skip leading spaces (allowed?) */
861           while(*startSearchHere == ' ') {
862               startSearchHere++;
863           }
864           nextSeparator = uprv_strchr(startSearchHere, '=');
865           /* need to normalize both keyword and keyword name */
866           if(!nextSeparator) {
867               break;
868           }
869           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
870               /* keyword name too long for internal buffer */
871               *status = U_INTERNAL_PROGRAM_ERROR;
872               return 0;
873           }
874           for(i = 0; i < nextSeparator - startSearchHere; i++) {
875               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
876           }
877           /* trim trailing spaces */
878           while(startSearchHere[i-1] == ' ') {
879               i--;
880           }
881           localeKeywordNameBuffer[i] = 0;
882 
883           startSearchHere = uprv_strchr(nextSeparator, ';');
884 
885           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
886               nextSeparator++;
887               while(*nextSeparator == ' ') {
888                   nextSeparator++;
889               }
890               /* we actually found the keyword. Copy the value */
891               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
892                   while(*(startSearchHere-1) == ' ') {
893                       startSearchHere--;
894                   }
895                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
896                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
897               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
898                   i = (int32_t)uprv_strlen(nextSeparator);
899                   while(nextSeparator[i - 1] == ' ') {
900                       i--;
901                   }
902                   uprv_strncpy(buffer, nextSeparator, i);
903                   result = u_terminateChars(buffer, bufferCapacity, i, status);
904               } else {
905                   /* give a bigger buffer, please */
906                   *status = U_BUFFER_OVERFLOW_ERROR;
907                   if(startSearchHere) {
908                       result = (int32_t)(startSearchHere - nextSeparator);
909                   } else {
910                       result = (int32_t)uprv_strlen(nextSeparator);
911                   }
912               }
913               return result;
914           }
915       }
916     }
917     return 0;
918 }
919 
920 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)921 uloc_setKeywordValue(const char* keywordName,
922                      const char* keywordValue,
923                      char* buffer, int32_t bufferCapacity,
924                      UErrorCode* status)
925 {
926     /* TODO: sorting. removal. */
927     int32_t keywordNameLen;
928     int32_t keywordValueLen;
929     int32_t bufLen;
930     int32_t needLen = 0;
931     int32_t foundValueLen;
932     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
933     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
934     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
935     int32_t i = 0;
936     int32_t rc;
937     char* nextSeparator = NULL;
938     char* nextEqualsign = NULL;
939     char* startSearchHere = NULL;
940     char* keywordStart = NULL;
941     char *insertHere = NULL;
942     if(U_FAILURE(*status)) {
943         return -1;
944     }
945     if(bufferCapacity>1) {
946         bufLen = (int32_t)uprv_strlen(buffer);
947     } else {
948         *status = U_ILLEGAL_ARGUMENT_ERROR;
949         return 0;
950     }
951     if(bufferCapacity<bufLen) {
952         /* The capacity is less than the length?! Is this NULL terminated? */
953         *status = U_ILLEGAL_ARGUMENT_ERROR;
954         return 0;
955     }
956     if(keywordValue && !*keywordValue) {
957         keywordValue = NULL;
958     }
959     if(keywordValue) {
960         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
961     } else {
962         keywordValueLen = 0;
963     }
964     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
965     if(U_FAILURE(*status)) {
966         return 0;
967     }
968     startSearchHere = (char*)locale_getKeywordsStart(buffer);
969     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
970         if(!keywordValue) { /* no keywords = nothing to remove */
971             return bufLen;
972         }
973 
974         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
975         if(startSearchHere) { /* had a single @ */
976             needLen--; /* already had the @ */
977             /* startSearchHere points at the @ */
978         } else {
979             startSearchHere=buffer+bufLen;
980         }
981         if(needLen >= bufferCapacity) {
982             *status = U_BUFFER_OVERFLOW_ERROR;
983             return needLen; /* no change */
984         }
985         *startSearchHere = '@';
986         startSearchHere++;
987         uprv_strcpy(startSearchHere, keywordNameBuffer);
988         startSearchHere += keywordNameLen;
989         *startSearchHere = '=';
990         startSearchHere++;
991         uprv_strcpy(startSearchHere, keywordValue);
992         startSearchHere+=keywordValueLen;
993         return needLen;
994     } /* end shortcut - no @ */
995 
996     keywordStart = startSearchHere;
997     /* search for keyword */
998     while(keywordStart) {
999         keywordStart++;
1000         /* skip leading spaces (allowed?) */
1001         while(*keywordStart == ' ') {
1002             keywordStart++;
1003         }
1004         nextEqualsign = uprv_strchr(keywordStart, '=');
1005         /* need to normalize both keyword and keyword name */
1006         if(!nextEqualsign) {
1007             break;
1008         }
1009         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1010             /* keyword name too long for internal buffer */
1011             *status = U_INTERNAL_PROGRAM_ERROR;
1012             return 0;
1013         }
1014         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1015             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1016         }
1017         /* trim trailing spaces */
1018         while(keywordStart[i-1] == ' ') {
1019             i--;
1020         }
1021         localeKeywordNameBuffer[i] = 0;
1022 
1023         nextSeparator = uprv_strchr(nextEqualsign, ';');
1024         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1025         if(rc == 0) {
1026             nextEqualsign++;
1027             while(*nextEqualsign == ' ') {
1028                 nextEqualsign++;
1029             }
1030             /* we actually found the keyword. Change the value */
1031             if (nextSeparator) {
1032                 keywordAtEnd = 0;
1033                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1034             } else {
1035                 keywordAtEnd = 1;
1036                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1037             }
1038             if(keywordValue) { /* adding a value - not removing */
1039               if(foundValueLen == keywordValueLen) {
1040                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1041                 return bufLen; /* no change in size */
1042               } else if(foundValueLen > keywordValueLen) {
1043                 int32_t delta = foundValueLen - keywordValueLen;
1044                 if(nextSeparator) { /* RH side */
1045                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1046                 }
1047                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1048                 bufLen -= delta;
1049                 buffer[bufLen]=0;
1050                 return bufLen;
1051               } else { /* FVL < KVL */
1052                 int32_t delta = keywordValueLen - foundValueLen;
1053                 if((bufLen+delta) >= bufferCapacity) {
1054                   *status = U_BUFFER_OVERFLOW_ERROR;
1055                   return bufLen+delta;
1056                 }
1057                 if(nextSeparator) { /* RH side */
1058                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1059                 }
1060                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1061                 bufLen += delta;
1062                 buffer[bufLen]=0;
1063                 return bufLen;
1064               }
1065             } else { /* removing a keyword */
1066               if(keywordAtEnd) {
1067                 /* zero out the ';' or '@' just before startSearchhere */
1068                 keywordStart[-1] = 0;
1069                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1070               } else {
1071                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1072                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1073                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1074               }
1075             }
1076         } else if(rc<0){ /* end match keyword */
1077           /* could insert at this location. */
1078           insertHere = keywordStart;
1079         }
1080         keywordStart = nextSeparator;
1081     } /* end loop searching */
1082 
1083     if(!keywordValue) {
1084       return bufLen; /* removal of non-extant keyword - no change */
1085     }
1086 
1087     /* we know there is at least one keyword. */
1088     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1089     if(needLen >= bufferCapacity) {
1090         *status = U_BUFFER_OVERFLOW_ERROR;
1091         return needLen; /* no change */
1092     }
1093 
1094     if(insertHere) {
1095       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1096       keywordStart = insertHere;
1097     } else {
1098       keywordStart = buffer+bufLen;
1099       *keywordStart = ';';
1100       keywordStart++;
1101     }
1102     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1103     keywordStart += keywordNameLen;
1104     *keywordStart = '=';
1105     keywordStart++;
1106     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1107     keywordStart+=keywordValueLen;
1108     if(insertHere) {
1109       *keywordStart = ';';
1110       keywordStart++;
1111     }
1112     buffer[needLen]=0;
1113     return needLen;
1114 }
1115 
/* ### ID parsing implementation **************************************************/

/* TRUE if a is a letter that can start a special prefix: 'x', 'X', 'i' or 'I'.
   The argument is evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 * The argument is evaluated more than once.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1128 
/*
 * Searches at most len chars of str for c and stops early at a NUL.
 * Returns a pointer to the first occurrence of c, or NULL if c does not
 * occur in the searched range.
 */
static char* _strnchr(const char* str, int32_t len, char c) {
    const char *p = str;

    U_ASSERT(str != 0 && len >= 0);
    for (; len != 0; --len, ++p) {
        /* the match test comes first, so searching for NUL finds the terminator */
        if (*p == c) {
            return (char*) p;
        }
        if (*p == 0) {
            return NULL;
        }
    }
    return NULL;
}
1142 
1143 /**
1144  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1145  * a NULL entry, followed by more entries, and a second NULL entry.
1146  *
1147  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1148  * COUNTRIES_3.
1149  */
_findIndex(const char * const * list,const char * key)1150 static int16_t _findIndex(const char* const* list, const char* key)
1151 {
1152     const char* const* anchor = list;
1153     int32_t pass = 0;
1154 
1155     /* Make two passes through two NULL-terminated arrays at 'list' */
1156     while (pass++ < 2) {
1157         while (*list) {
1158             if (uprv_strcmp(key, *list) == 0) {
1159                 return (int16_t)(list - anchor);
1160             }
1161             list++;
1162         }
1163         ++list;     /* skip final NULL *CWB*/
1164     }
1165     return -1;
1166 }
1167 
1168 /* count the length of src while copying it to dest; return strlen(src) */
1169 static U_INLINE int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1170 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1171     const char *anchor;
1172     char c;
1173 
1174     anchor=src;
1175     for(;;) {
1176         if((c=*src)==0) {
1177             return (int32_t)(src-anchor);
1178         }
1179         if(destCapacity<=0) {
1180             return (int32_t)((src-anchor)+uprv_strlen(src));
1181         }
1182         ++src;
1183         *dest++=c;
1184         --destCapacity;
1185     }
1186 }
1187 
1188 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1189 uloc_getCurrentCountryID(const char* oldID){
1190     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1191     if (offset >= 0) {
1192         return REPLACEMENT_COUNTRIES[offset];
1193     }
1194     return oldID;
1195 }
1196 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1197 uloc_getCurrentLanguageID(const char* oldID){
1198     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1199     if (offset >= 0) {
1200         return REPLACEMENT_LANGUAGES[offset];
1201     }
1202     return oldID;
1203 }
1204 /*
1205  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1206  * avoid duplicating code to handle the earlier locale ID pieces
1207  * in the functions for the later ones by
1208  * setting the *pEnd pointer to where they stopped parsing
1209  *
1210  * TODO try to use this in Locale
1211  */
1212 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1213 ulocimp_getLanguage(const char *localeID,
1214                     char *language, int32_t languageCapacity,
1215                     const char **pEnd) {
1216     int32_t i=0;
1217     int32_t offset;
1218     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1219 
1220     /* if it starts with i- or x- then copy that prefix */
1221     if(_isIDPrefix(localeID)) {
1222         if(i<languageCapacity) {
1223             language[i]=(char)uprv_tolower(*localeID);
1224         }
1225         if(i<languageCapacity) {
1226             language[i+1]='-';
1227         }
1228         i+=2;
1229         localeID+=2;
1230     }
1231 
1232     /* copy the language as far as possible and count its length */
1233     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1234         if(i<languageCapacity) {
1235             language[i]=(char)uprv_tolower(*localeID);
1236         }
1237         if(i<3) {
1238             lang[i]=(char)uprv_tolower(*localeID);
1239         }
1240         i++;
1241         localeID++;
1242     }
1243 
1244     if(i==3) {
1245         /* convert 3 character code to 2 character code if possible *CWB*/
1246         offset=_findIndex(LANGUAGES_3, lang);
1247         if(offset>=0) {
1248             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1249         }
1250     }
1251 
1252     if(pEnd!=NULL) {
1253         *pEnd=localeID;
1254     }
1255     return i;
1256 }
1257 
1258 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1259 ulocimp_getScript(const char *localeID,
1260                   char *script, int32_t scriptCapacity,
1261                   const char **pEnd)
1262 {
1263     int32_t idLen = 0;
1264 
1265     if (pEnd != NULL) {
1266         *pEnd = localeID;
1267     }
1268 
1269     /* copy the second item as far as possible and count its length */
1270     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1271         idLen++;
1272     }
1273 
1274     /* If it's exactly 4 characters long, then it's a script and not a country. */
1275     if (idLen == 4) {
1276         int32_t i;
1277         if (pEnd != NULL) {
1278             *pEnd = localeID+idLen;
1279         }
1280         if(idLen > scriptCapacity) {
1281             idLen = scriptCapacity;
1282         }
1283         if (idLen >= 1) {
1284             script[0]=(char)uprv_toupper(*(localeID++));
1285         }
1286         for (i = 1; i < idLen; i++) {
1287             script[i]=(char)uprv_tolower(*(localeID++));
1288         }
1289     }
1290     else {
1291         idLen = 0;
1292     }
1293     return idLen;
1294 }
1295 
1296 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1297 ulocimp_getCountry(const char *localeID,
1298                    char *country, int32_t countryCapacity,
1299                    const char **pEnd)
1300 {
1301     int32_t idLen=0;
1302     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1303     int32_t offset;
1304 
1305     /* copy the country as far as possible and count its length */
1306     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1307         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1308             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1309         }
1310         idLen++;
1311     }
1312 
1313     /* the country should be either length 2 or 3 */
1314     if (idLen == 2 || idLen == 3) {
1315         UBool gotCountry = FALSE;
1316         /* convert 3 character code to 2 character code if possible *CWB*/
1317         if(idLen==3) {
1318             offset=_findIndex(COUNTRIES_3, cnty);
1319             if(offset>=0) {
1320                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1321                 gotCountry = TRUE;
1322             }
1323         }
1324         if (!gotCountry) {
1325             int32_t i = 0;
1326             for (i = 0; i < idLen; i++) {
1327                 if (i < countryCapacity) {
1328                     country[i]=(char)uprv_toupper(localeID[i]);
1329                 }
1330             }
1331         }
1332         localeID+=idLen;
1333     } else {
1334         idLen = 0;
1335     }
1336 
1337     if(pEnd!=NULL) {
1338         *pEnd=localeID;
1339     }
1340 
1341     return idLen;
1342 }
1343 
1344 /**
1345  * @param needSeparator if true, then add leading '_' if any variants
1346  * are added to 'variant'
1347  */
1348 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1349 _getVariantEx(const char *localeID,
1350               char prev,
1351               char *variant, int32_t variantCapacity,
1352               UBool needSeparator) {
1353     int32_t i=0;
1354 
1355     /* get one or more variant tags and separate them with '_' */
1356     if(_isIDSeparator(prev)) {
1357         /* get a variant string after a '-' or '_' */
1358         while(!_isTerminator(*localeID)) {
1359             if (needSeparator) {
1360                 if (i<variantCapacity) {
1361                     variant[i] = '_';
1362                 }
1363                 ++i;
1364                 needSeparator = FALSE;
1365             }
1366             if(i<variantCapacity) {
1367                 variant[i]=(char)uprv_toupper(*localeID);
1368                 if(variant[i]=='-') {
1369                     variant[i]='_';
1370                 }
1371             }
1372             i++;
1373             localeID++;
1374         }
1375     }
1376 
1377     /* if there is no variant tag after a '-' or '_' then look for '@' */
1378     if(i==0) {
1379         if(prev=='@') {
1380             /* keep localeID */
1381         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1382             ++localeID; /* point after the '@' */
1383         } else {
1384             return 0;
1385         }
1386         while(!_isTerminator(*localeID)) {
1387             if (needSeparator) {
1388                 if (i<variantCapacity) {
1389                     variant[i] = '_';
1390                 }
1391                 ++i;
1392                 needSeparator = FALSE;
1393             }
1394             if(i<variantCapacity) {
1395                 variant[i]=(char)uprv_toupper(*localeID);
1396                 if(variant[i]=='-' || variant[i]==',') {
1397                     variant[i]='_';
1398                 }
1399             }
1400             i++;
1401             localeID++;
1402         }
1403     }
1404 
1405     return i;
1406 }
1407 
1408 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1409 _getVariant(const char *localeID,
1410             char prev,
1411             char *variant, int32_t variantCapacity) {
1412     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1413 }
1414 
1415 /**
1416  * Delete ALL instances of a variant from the given list of one or
1417  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1418  * @param variants the source string of one or more variants,
1419  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1420  * terminated; if it is, trailing zero will NOT be maintained.
1421  * @param variantsLen length of variants
1422  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1423  * or "PREEURO"; not zero terminated
1424  * @param toDeleteLen length of toDelete
1425  * @return number of characters deleted from variants
1426  */
1427 static int32_t
_deleteVariant(char * variants,int32_t variantsLen,const char * toDelete,int32_t toDeleteLen)1428 _deleteVariant(char* variants, int32_t variantsLen,
1429                const char* toDelete, int32_t toDeleteLen)
1430 {
1431     int32_t delta = 0; /* number of chars deleted */
1432     for (;;) {
1433         UBool flag = FALSE;
1434         if (variantsLen < toDeleteLen) {
1435             return delta;
1436         }
1437         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1438             (variantsLen == toDeleteLen ||
1439              (flag=(variants[toDeleteLen] == '_'))))
1440         {
1441             int32_t d = toDeleteLen + (flag?1:0);
1442             variantsLen -= d;
1443             delta += d;
1444             if (variantsLen > 0) {
1445                 uprv_memmove(variants, variants+d, variantsLen);
1446             }
1447         } else {
1448             char* p = _strnchr(variants, variantsLen, '_');
1449             if (p == NULL) {
1450                 return delta;
1451             }
1452             ++p;
1453             variantsLen -= (int32_t)(p - variants);
1454             variants = p;
1455         }
1456     }
1457 }
1458 
1459 /* Keyword enumeration */
1460 
/* State kept behind UEnumeration.context for a keyword enumeration. */
typedef struct UKeywordsContext {
    char* keywords; /* heap copy of the keyword list: NUL-terminated entries laid
                       end to end, closed by an empty entry; freed by
                       uloc_kw_closeKeywords() */
    char* current;  /* entry that uloc_kw_nextKeyword() will return next */
} UKeywordsContext;
1465 
1466 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1467 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1468     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1469     uprv_free(enumerator->context);
1470     uprv_free(enumerator);
1471 }
1472 
1473 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode * status)1474 uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1475     char *kw = ((UKeywordsContext *)en->context)->keywords;
1476     int32_t result = 0;
1477     while(*kw) {
1478         result++;
1479         kw += uprv_strlen(kw)+1;
1480     }
1481     return result;
1482 }
1483 
1484 static const char* U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode * status)1485 uloc_kw_nextKeyword(UEnumeration* en,
1486                     int32_t* resultLength,
1487                     UErrorCode* status) {
1488     const char* result = ((UKeywordsContext *)en->context)->current;
1489     int32_t len = 0;
1490     if(*result) {
1491         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1492         ((UKeywordsContext *)en->context)->current += len+1;
1493     } else {
1494         result = NULL;
1495     }
1496     if (resultLength) {
1497         *resultLength = len;
1498     }
1499     return result;
1500 }
1501 
1502 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode * status)1503 uloc_kw_resetKeywords(UEnumeration* en,
1504                       UErrorCode* status) {
1505     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1506 }
1507 
/* Template that uloc_openKeywordList() copies into every enumeration it
   creates; the copy's 'context' member is assigned afterwards.
   NOTE(review): which slot is which follows the UEnumeration declaration
   (uenumimp.h), which is not visible in this part of the file. */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,  /* frees buffer, context and enumeration */
    uloc_kw_countKeywords,  /* counts entries from the start of the list */
    uenum_unextDefault,     /* library-provided default, defined outside this file */
    uloc_kw_nextKeyword,    /* returns the current entry and advances */
    uloc_kw_resetKeywords   /* rewinds to the first entry */
};
1517 
1518 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1519 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1520 {
1521     UKeywordsContext *myContext = NULL;
1522     UEnumeration *result = NULL;
1523 
1524     if(U_FAILURE(*status)) {
1525         return NULL;
1526     }
1527     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1528     /* Null pointer test */
1529     if (result == NULL) {
1530         *status = U_MEMORY_ALLOCATION_ERROR;
1531         return NULL;
1532     }
1533     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1534     myContext = uprv_malloc(sizeof(UKeywordsContext));
1535     if (myContext == NULL) {
1536         *status = U_MEMORY_ALLOCATION_ERROR;
1537         uprv_free(result);
1538         return NULL;
1539     }
1540     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1541     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1542     myContext->keywords[keywordListSize] = 0;
1543     myContext->current = myContext->keywords;
1544     result->context = myContext;
1545     return result;
1546 }
1547 
1548 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1549 uloc_openKeywords(const char* localeID,
1550                         UErrorCode* status)
1551 {
1552     int32_t i=0;
1553     char keywords[256];
1554     int32_t keywordsCapacity = 256;
1555     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1556     const char* tmpLocaleID;
1557 
1558     if(status==NULL || U_FAILURE(*status)) {
1559         return 0;
1560     }
1561 
1562     if (_hasBCP47Extension(localeID)) {
1563         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1564     } else {
1565         if (localeID==NULL) {
1566            localeID=uloc_getDefault();
1567         }
1568         tmpLocaleID=localeID;
1569     }
1570 
1571     /* Skip the language */
1572     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1573     if(_isIDSeparator(*tmpLocaleID)) {
1574         const char *scriptID;
1575         /* Skip the script if available */
1576         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1577         if(scriptID != tmpLocaleID+1) {
1578             /* Found optional script */
1579             tmpLocaleID = scriptID;
1580         }
1581         /* Skip the Country */
1582         if (_isIDSeparator(*tmpLocaleID)) {
1583             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1584             if(_isIDSeparator(*tmpLocaleID)) {
1585                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1586             }
1587         }
1588     }
1589 
1590     /* keywords are located after '@' */
1591     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1592         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1593     }
1594 
1595     if(i) {
1596         return uloc_openKeywordList(keywords, i, status);
1597     } else {
1598         return NULL;
1599     }
1600 }
1601 
1602 
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True when at least one bit of 'mask' is set in 'options'.  Both arguments
   are parenthesized so that compound expressions such as (A | B) are tested
   as a whole instead of being split by operator precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag "i-default"; deliberately not NUL-terminated, always used
   together with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Element count of i_default, as int32_t so that comparisons against int32_t
   lengths are not signed/unsigned mixed. */
#define I_DEFAULT_LENGTH ((int32_t)(sizeof i_default / sizeof i_default[0]))
1611 
1612 /**
1613  * Canonicalize the given localeID, to level 1 or to level 2,
1614  * depending on the options.  To specify level 1, pass in options=0.
1615  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1616  *
1617  * This is the code underlying uloc_getName and uloc_canonicalize.
1618  */
1619 static int32_t
_canonicalize(const char * localeID,char * result,int32_t resultCapacity,uint32_t options,UErrorCode * err)1620 _canonicalize(const char* localeID,
1621               char* result,
1622               int32_t resultCapacity,
1623               uint32_t options,
1624               UErrorCode* err) {
1625     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1626     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1627     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1628     const char* origLocaleID;
1629     const char* tmpLocaleID;
1630     const char* keywordAssign = NULL;
1631     const char* separatorIndicator = NULL;
1632     const char* addKeyword = NULL;
1633     const char* addValue = NULL;
1634     char* name;
1635     char* variant = NULL; /* pointer into name, or NULL */
1636 
1637     if (U_FAILURE(*err)) {
1638         return 0;
1639     }
1640 
1641     if (_hasBCP47Extension(localeID)) {
1642         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1643     } else {
1644         if (localeID==NULL) {
1645            localeID=uloc_getDefault();
1646         }
1647         tmpLocaleID=localeID;
1648     }
1649 
1650     origLocaleID=tmpLocaleID;
1651 
1652     /* if we are doing a full canonicalization, then put results in
1653        localeBuffer, if necessary; otherwise send them to result. */
1654     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1655         (result == NULL || resultCapacity <  sizeof(localeBuffer))) {
1656         name = localeBuffer;
1657         nameCapacity = sizeof(localeBuffer);
1658     } else {
1659         name = result;
1660         nameCapacity = resultCapacity;
1661     }
1662 
1663     /* get all pieces, one after another, and separate with '_' */
1664     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1665 
1666     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1667         const char *d = uloc_getDefault();
1668 
1669         len = (int32_t)uprv_strlen(d);
1670 
1671         if (name != NULL) {
1672             uprv_strncpy(name, d, len);
1673         }
1674     } else if(_isIDSeparator(*tmpLocaleID)) {
1675         const char *scriptID;
1676 
1677         ++fieldCount;
1678         if(len<nameCapacity) {
1679             name[len]='_';
1680         }
1681         ++len;
1682 
1683         scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1684         if(scriptSize > 0) {
1685             /* Found optional script */
1686             tmpLocaleID = scriptID;
1687             ++fieldCount;
1688             len+=scriptSize;
1689             if (_isIDSeparator(*tmpLocaleID)) {
1690                 /* If there is something else, then we add the _ */
1691                 if(len<nameCapacity) {
1692                     name[len]='_';
1693                 }
1694                 ++len;
1695             }
1696         }
1697 
1698         if (_isIDSeparator(*tmpLocaleID)) {
1699             const char *cntryID;
1700             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1701             if (cntrySize > 0) {
1702                 /* Found optional country */
1703                 tmpLocaleID = cntryID;
1704                 len+=cntrySize;
1705             }
1706             if(_isIDSeparator(*tmpLocaleID)) {
1707                 /* If there is something else, then we add the _  if we found country before.*/
1708                 if (cntrySize > 0) {
1709                     ++fieldCount;
1710                     if(len<nameCapacity) {
1711                         name[len]='_';
1712                     }
1713                     ++len;
1714                 }
1715 
1716                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1717                 if (variantSize > 0) {
1718                     variant = name+len;
1719                     len += variantSize;
1720                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1721                 }
1722             }
1723         }
1724     }
1725 
1726     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1727     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1728         UBool done = FALSE;
1729         do {
1730             char c = *tmpLocaleID;
1731             switch (c) {
1732             case 0:
1733             case '@':
1734                 done = TRUE;
1735                 break;
1736             default:
1737                 if (len<nameCapacity) {
1738                     name[len] = c;
1739                 }
1740                 ++len;
1741                 ++tmpLocaleID;
1742                 break;
1743             }
1744         } while (!done);
1745     }
1746 
1747     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1748        After this, tmpLocaleID either points to '@' or is NULL */
1749     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1750         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1751         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1752     }
1753 
1754     /* Copy POSIX-style variant, if any [mr@FOO] */
1755     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1756         tmpLocaleID != NULL && keywordAssign == NULL) {
1757         for (;;) {
1758             char c = *tmpLocaleID;
1759             if (c == 0) {
1760                 break;
1761             }
1762             if (len<nameCapacity) {
1763                 name[len] = c;
1764             }
1765             ++len;
1766             ++tmpLocaleID;
1767         }
1768     }
1769 
1770     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1771         /* Handle @FOO variant if @ is present and not followed by = */
1772         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1773             int32_t posixVariantSize;
1774             /* Add missing '_' if needed */
1775             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1776                 do {
1777                     if(len<nameCapacity) {
1778                         name[len]='_';
1779                     }
1780                     ++len;
1781                     ++fieldCount;
1782                 } while(fieldCount<2);
1783             }
1784             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1785                                              (UBool)(variantSize > 0));
1786             if (posixVariantSize > 0) {
1787                 if (variant == NULL) {
1788                     variant = name+len;
1789                 }
1790                 len += posixVariantSize;
1791                 variantSize += posixVariantSize;
1792             }
1793         }
1794 
1795         /* Handle generic variants first */
1796         if (variant) {
1797             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1798                 const char* variantToCompare = VARIANT_MAP[j].variant;
1799                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1800                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1801                 len -= variantLen;
1802                 if (variantLen > 0) {
1803                     /* BEGIN android-changed
1804                        Apply fixes for ICU ticket8984. */
1805                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1806                         --len;
1807                     }
1808                     /* END android-changed */
1809                     addKeyword = VARIANT_MAP[j].keyword;
1810                     addValue = VARIANT_MAP[j].value;
1811                     break;
1812                 }
1813             }
1814             /* BEGIN android-changed
1815                Apply fixes for ICU ticket8984. */
1816             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1817                 --len;
1818             }
1819             /* END android-changed */
1820         }
1821 
1822         /* Look up the ID in the canonicalization map */
1823         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1824             const char* id = CANONICALIZE_MAP[j].id;
1825             int32_t n = (int32_t)uprv_strlen(id);
1826             if (len == n && uprv_strncmp(name, id, n) == 0) {
1827                 if (n == 0 && tmpLocaleID != NULL) {
1828                     break; /* Don't remap "" if keywords present */
1829                 }
1830                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1831                 if (CANONICALIZE_MAP[j].keyword) {
1832                     addKeyword = CANONICALIZE_MAP[j].keyword;
1833                     addValue = CANONICALIZE_MAP[j].value;
1834                 }
1835                 break;
1836             }
1837         }
1838     }
1839 
1840     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1841         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1842             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1843             if(len<nameCapacity) {
1844                 name[len]='@';
1845             }
1846             ++len;
1847             ++fieldCount;
1848             len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1849                                 addKeyword, addValue, err);
1850         } else if (addKeyword != NULL) {
1851             U_ASSERT(addValue != NULL);
1852             /* inelegant but works -- later make _getKeywords do this? */
1853             len += _copyCount(name+len, nameCapacity-len, "@");
1854             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1855             len += _copyCount(name+len, nameCapacity-len, "=");
1856             len += _copyCount(name+len, nameCapacity-len, addValue);
1857         }
1858     }
1859 
1860     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1861         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1862     }
1863 
1864     return u_terminateChars(result, resultCapacity, len, err);
1865 }
1866 
1867 /* ### ID parsing API **************************************************/
1868 
1869 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1870 uloc_getParent(const char*    localeID,
1871                char* parent,
1872                int32_t parentCapacity,
1873                UErrorCode* err)
1874 {
1875     const char *lastUnderscore;
1876     int32_t i;
1877 
1878     if (U_FAILURE(*err))
1879         return 0;
1880 
1881     if (localeID == NULL)
1882         localeID = uloc_getDefault();
1883 
1884     lastUnderscore=uprv_strrchr(localeID, '_');
1885     if(lastUnderscore!=NULL) {
1886         i=(int32_t)(lastUnderscore-localeID);
1887     } else {
1888         i=0;
1889     }
1890 
1891     if(i>0 && parent != localeID) {
1892         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1893     }
1894     return u_terminateChars(parent, parentCapacity, i, err);
1895 }
1896 
1897 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1898 uloc_getLanguage(const char*    localeID,
1899          char* language,
1900          int32_t languageCapacity,
1901          UErrorCode* err)
1902 {
1903     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1904     int32_t i=0;
1905 
1906     if (err==NULL || U_FAILURE(*err)) {
1907         return 0;
1908     }
1909 
1910     if(localeID==NULL) {
1911         localeID=uloc_getDefault();
1912     }
1913 
1914     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1915     return u_terminateChars(language, languageCapacity, i, err);
1916 }
1917 
1918 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1919 uloc_getScript(const char*    localeID,
1920          char* script,
1921          int32_t scriptCapacity,
1922          UErrorCode* err)
1923 {
1924     int32_t i=0;
1925 
1926     if(err==NULL || U_FAILURE(*err)) {
1927         return 0;
1928     }
1929 
1930     if(localeID==NULL) {
1931         localeID=uloc_getDefault();
1932     }
1933 
1934     /* skip the language */
1935     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1936     if(_isIDSeparator(*localeID)) {
1937         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1938     }
1939     return u_terminateChars(script, scriptCapacity, i, err);
1940 }
1941 
1942 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1943 uloc_getCountry(const char* localeID,
1944             char* country,
1945             int32_t countryCapacity,
1946             UErrorCode* err)
1947 {
1948     int32_t i=0;
1949 
1950     if(err==NULL || U_FAILURE(*err)) {
1951         return 0;
1952     }
1953 
1954     if(localeID==NULL) {
1955         localeID=uloc_getDefault();
1956     }
1957 
1958     /* Skip the language */
1959     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1960     if(_isIDSeparator(*localeID)) {
1961         const char *scriptID;
1962         /* Skip the script if available */
1963         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1964         if(scriptID != localeID+1) {
1965             /* Found optional script */
1966             localeID = scriptID;
1967         }
1968         if(_isIDSeparator(*localeID)) {
1969             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1970         }
1971     }
1972     return u_terminateChars(country, countryCapacity, i, err);
1973 }
1974 
1975 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1976 uloc_getVariant(const char* localeID,
1977                 char* variant,
1978                 int32_t variantCapacity,
1979                 UErrorCode* err)
1980 {
1981     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1982     const char* tmpLocaleID;
1983     int32_t i=0;
1984 
1985     if(err==NULL || U_FAILURE(*err)) {
1986         return 0;
1987     }
1988 
1989     if (_hasBCP47Extension(localeID)) {
1990         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1991     } else {
1992         if (localeID==NULL) {
1993            localeID=uloc_getDefault();
1994         }
1995         tmpLocaleID=localeID;
1996     }
1997 
1998     /* Skip the language */
1999     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2000     if(_isIDSeparator(*tmpLocaleID)) {
2001         const char *scriptID;
2002         /* Skip the script if available */
2003         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2004         if(scriptID != tmpLocaleID+1) {
2005             /* Found optional script */
2006             tmpLocaleID = scriptID;
2007         }
2008         /* Skip the Country */
2009         if (_isIDSeparator(*tmpLocaleID)) {
2010             const char *cntryID;
2011             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2012             if (cntryID != tmpLocaleID+1) {
2013                 /* Found optional country */
2014                 tmpLocaleID = cntryID;
2015             }
2016             if(_isIDSeparator(*tmpLocaleID)) {
2017                 /* If there was no country ID, skip a possible extra IDSeparator */
2018                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2019                     tmpLocaleID++;
2020                 }
2021                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2022             }
2023         }
2024     }
2025 
2026     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2027     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2028 /*
2029     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2030         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2031     }
2032 */
2033     return u_terminateChars(variant, variantCapacity, i, err);
2034 }
2035 
2036 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2037 uloc_getName(const char* localeID,
2038              char* name,
2039              int32_t nameCapacity,
2040              UErrorCode* err)
2041 {
2042     return _canonicalize(localeID, name, nameCapacity, 0, err);
2043 }
2044 
2045 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2046 uloc_getBaseName(const char* localeID,
2047                  char* name,
2048                  int32_t nameCapacity,
2049                  UErrorCode* err)
2050 {
2051     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2052 }
2053 
2054 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2055 uloc_canonicalize(const char* localeID,
2056                   char* name,
2057                   int32_t nameCapacity,
2058                   UErrorCode* err)
2059 {
2060     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2061 }
2062 
2063 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)2064 uloc_getISO3Language(const char* localeID)
2065 {
2066     int16_t offset;
2067     char lang[ULOC_LANG_CAPACITY];
2068     UErrorCode err = U_ZERO_ERROR;
2069 
2070     if (localeID == NULL)
2071     {
2072         localeID = uloc_getDefault();
2073     }
2074     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2075     if (U_FAILURE(err))
2076         return "";
2077     offset = _findIndex(LANGUAGES, lang);
2078     if (offset < 0)
2079         return "";
2080     return LANGUAGES_3[offset];
2081 }
2082 
2083 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2084 uloc_getISO3Country(const char* localeID)
2085 {
2086     int16_t offset;
2087     char cntry[ULOC_LANG_CAPACITY];
2088     UErrorCode err = U_ZERO_ERROR;
2089 
2090     if (localeID == NULL)
2091     {
2092         localeID = uloc_getDefault();
2093     }
2094     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2095     if (U_FAILURE(err))
2096         return "";
2097     offset = _findIndex(COUNTRIES, cntry);
2098     if (offset < 0)
2099         return "";
2100 
2101     return COUNTRIES_3[offset];
2102 }
2103 
2104 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2105 uloc_getLCID(const char* localeID)
2106 {
2107     UErrorCode status = U_ZERO_ERROR;
2108     char       langID[ULOC_FULLNAME_CAPACITY];
2109 
2110     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2111     if (U_FAILURE(status)) {
2112         return 0;
2113     }
2114 
2115     return uprv_convertToLCID(langID, localeID, &status);
2116 }
2117 
2118 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2119 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2120                 UErrorCode *status)
2121 {
2122     int32_t length;
2123     const char *posix = uprv_convertToPosix(hostid, status);
2124     if (U_FAILURE(*status) || posix == NULL) {
2125         return 0;
2126     }
2127     length = (int32_t)uprv_strlen(posix);
2128     if (length+1 > localeCapacity) {
2129         *status = U_BUFFER_OVERFLOW_ERROR;
2130     }
2131     else {
2132         uprv_strcpy(locale, posix);
2133     }
2134     return length;
2135 }
2136 
2137 /* ### Default locale **************************************************/
2138 
2139 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2140 uloc_getDefault()
2141 {
2142     return locale_get_default();
2143 }
2144 
2145 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2146 uloc_setDefault(const char*   newDefaultLocale,
2147              UErrorCode* err)
2148 {
2149     if (U_FAILURE(*err))
2150         return;
2151     /* the error code isn't currently used for anything by this function*/
2152 
2153     /* propagate change to C++ */
2154     locale_set_default(newDefaultLocale);
2155 }
2156 
2157 /**
2158  * Returns a list of all language codes defined in ISO 639.  This is a pointer
2159  * to an array of pointers to arrays of char.  All of these pointers are owned
2160  * by ICU-- do not delete them, and do not write through them.  The array is
2161  * terminated with a null pointer.
2162  */
2163 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2164 uloc_getISOLanguages()
2165 {
2166     return LANGUAGES;
2167 }
2168 
2169 /**
2170  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2171  * pointer to an array of pointers to arrays of char.  All of these pointers are
2172  * owned by ICU-- do not delete them, and do not write through them.  The array is
2173  * terminated with a null pointer.
2174  */
2175 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2176 uloc_getISOCountries()
2177 {
2178     return COUNTRIES;
2179 }
2180 
2181 
/* this function to be moved into cstring.c later */

/* Decimal separator that the C library's formatting currently produces;
   0 until the first call of _uloc_strtod() has determined it. */
static char gDecimal = 0;

/**
 * strtod() variant that always accepts '.' as the decimal separator.
 *
 * When the runtime formats numbers with a different separator, the first 29
 * characters of 'start' are copied, the first '.' in the copy is replaced by
 * that separator, and the copy is parsed instead.
 *
 * @param start text to parse
 * @param end   if not NULL, receives the position in 'start' where parsing
 *              stopped
 * @return the parsed value
 */
static double
_uloc_strtod(const char *start, char **end) {
    char localCopy[30];
    char *dot;
    char *parsedEnd;
    double value;

    if (gDecimal == 0) {
        char sample[5];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(sample, "%+1.1f", 1.0);
        gDecimal = sample[2]; /* character between the two digits */
    }

    if (gDecimal == '.') {
        return uprv_strtod(start, end); /* fall through to OS */
    }

    uprv_strncpy(localCopy, start, 29);
    localCopy[29] = 0;
    dot = uprv_strchr(localCopy, '.');
    if (dot == NULL) {
        return uprv_strtod(start, end); /* no decimal point */
    }
    *dot = gDecimal;

    value = uprv_strtod(localCopy, &parsedEnd);
    if (end != NULL) {
        /* translate the stop position back into the caller's string;
           cast away const (to follow uprv_strtod API.) */
        *end = (char*)(start + (parsedEnd - localCopy));
    }
    return value;
}
2220 
/*
 * One entry of a parsed Accept-Language list: a locale ID together with its
 * quality weight.  Arrays of these are sorted with
 * uloc_acceptLanguageCompare().
 */
typedef struct {
    float q;        /* quality weight; entries with a larger q sort first */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* locale ID string for this entry */
} _acceptLangItem;
2226 
2227 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void * context,const void * a,const void * b)2228 uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2229 {
2230     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2231     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2232 
2233     int32_t rc = 0;
2234     if(bb->q < aa->q) {
2235         rc = -1;  /* A > B */
2236     } else if(bb->q > aa->q) {
2237         rc = 1;   /* A < B */
2238     } else {
2239         rc = 0;   /* A = B */
2240     }
2241 
2242     if(rc==0) {
2243         rc = uprv_stricmp(aa->locale, bb->locale);
2244     }
2245 
2246 #if defined(ULOC_DEBUG)
2247     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2248     aa->locale, aa->q,
2249     bb->locale, bb->q,
2250     rc);*/
2251 #endif
2252 
2253     return rc;
2254 }
2255 
2256 /*
2257 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2258 */
2259 
2260 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2261 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2262                             const char *httpAcceptLanguage,
2263                             UEnumeration* availableLocales,
2264                             UErrorCode *status)
2265 {
2266     _acceptLangItem *j;
2267     _acceptLangItem smallBuffer[30];
2268     char **strs;
2269     char tmp[ULOC_FULLNAME_CAPACITY +1];
2270     int32_t n = 0;
2271     const char *itemEnd;
2272     const char *paramEnd;
2273     const char *s;
2274     const char *t;
2275     int32_t res;
2276     int32_t i;
2277     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2278     int32_t jSize;
2279     char *tempstr; /* Use for null pointer check */
2280 
2281     j = smallBuffer;
2282     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2283     if(U_FAILURE(*status)) {
2284         return -1;
2285     }
2286 
2287     for(s=httpAcceptLanguage;s&&*s;) {
2288         while(isspace(*s)) /* eat space at the beginning */
2289             s++;
2290         itemEnd=uprv_strchr(s,',');
2291         paramEnd=uprv_strchr(s,';');
2292         if(!itemEnd) {
2293             itemEnd = httpAcceptLanguage+l; /* end of string */
2294         }
2295         if(paramEnd && paramEnd<itemEnd) {
2296             /* semicolon (;) is closer than end (,) */
2297             t = paramEnd+1;
2298             if(*t=='q') {
2299                 t++;
2300             }
2301             while(isspace(*t)) {
2302                 t++;
2303             }
2304             if(*t=='=') {
2305                 t++;
2306             }
2307             while(isspace(*t)) {
2308                 t++;
2309             }
2310             j[n].q = (float)_uloc_strtod(t,NULL);
2311         } else {
2312             /* no semicolon - it's 1.0 */
2313             j[n].q = 1.0f;
2314             paramEnd = itemEnd;
2315         }
2316         j[n].dummy=0;
2317         /* eat spaces prior to semi */
2318         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2319             ;
2320         /* Check for null pointer from uprv_strndup */
2321         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2322         if (tempstr == NULL) {
2323             *status = U_MEMORY_ALLOCATION_ERROR;
2324             return -1;
2325         }
2326         j[n].locale = tempstr;
2327         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2328         if(strcmp(j[n].locale,tmp)) {
2329             uprv_free(j[n].locale);
2330             j[n].locale=uprv_strdup(tmp);
2331         }
2332 #if defined(ULOC_DEBUG)
2333         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2334 #endif
2335         n++;
2336         s = itemEnd;
2337         while(*s==',') { /* eat duplicate commas */
2338             s++;
2339         }
2340         if(n>=jSize) {
2341             if(j==smallBuffer) {  /* overflowed the small buffer. */
2342                 j = uprv_malloc(sizeof(j[0])*(jSize*2));
2343                 if(j!=NULL) {
2344                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2345                 }
2346 #if defined(ULOC_DEBUG)
2347                 fprintf(stderr,"malloced at size %d\n", jSize);
2348 #endif
2349             } else {
2350                 j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2351 #if defined(ULOC_DEBUG)
2352                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2353 #endif
2354             }
2355             jSize *= 2;
2356             if(j==NULL) {
2357                 *status = U_MEMORY_ALLOCATION_ERROR;
2358                 return -1;
2359             }
2360         }
2361     }
2362     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2363     if(U_FAILURE(*status)) {
2364         if(j != smallBuffer) {
2365 #if defined(ULOC_DEBUG)
2366             fprintf(stderr,"freeing j %p\n", j);
2367 #endif
2368             uprv_free(j);
2369         }
2370         return -1;
2371     }
2372     strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2373     /* Check for null pointer */
2374     if (strs == NULL) {
2375         uprv_free(j); /* Free to avoid memory leak */
2376         *status = U_MEMORY_ALLOCATION_ERROR;
2377         return -1;
2378     }
2379     for(i=0;i<n;i++) {
2380 #if defined(ULOC_DEBUG)
2381         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2382 #endif
2383         strs[i]=j[i].locale;
2384     }
2385     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2386         (const char**)strs, n, availableLocales, status);
2387     for(i=0;i<n;i++) {
2388         uprv_free(strs[i]);
2389     }
2390     uprv_free(strs);
2391     if(j != smallBuffer) {
2392 #if defined(ULOC_DEBUG)
2393         fprintf(stderr,"freeing j %p\n", j);
2394 #endif
2395         uprv_free(j);
2396     }
2397     return res;
2398 }
2399 
2400 
2401 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2402 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2403                     UAcceptResult *outResult, const char **acceptList,
2404                     int32_t acceptListCount,
2405                     UEnumeration* availableLocales,
2406                     UErrorCode *status)
2407 {
2408     int32_t i,j;
2409     int32_t len;
2410     int32_t maxLen=0;
2411     char tmp[ULOC_FULLNAME_CAPACITY+1];
2412     const char *l;
2413     char **fallbackList;
2414     if(U_FAILURE(*status)) {
2415         return -1;
2416     }
2417     fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2418     if(fallbackList==NULL) {
2419         *status = U_MEMORY_ALLOCATION_ERROR;
2420         return -1;
2421     }
2422     for(i=0;i<acceptListCount;i++) {
2423 #if defined(ULOC_DEBUG)
2424         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2425 #endif
2426         while((l=uenum_next(availableLocales, NULL, status))) {
2427 #if defined(ULOC_DEBUG)
2428             fprintf(stderr,"  %s\n", l);
2429 #endif
2430             len = (int32_t)uprv_strlen(l);
2431             if(!uprv_strcmp(acceptList[i], l)) {
2432                 if(outResult) {
2433                     *outResult = ULOC_ACCEPT_VALID;
2434                 }
2435 #if defined(ULOC_DEBUG)
2436                 fprintf(stderr, "MATCH! %s\n", l);
2437 #endif
2438                 if(len>0) {
2439                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2440                 }
2441                 for(j=0;j<i;j++) {
2442                     uprv_free(fallbackList[j]);
2443                 }
2444                 uprv_free(fallbackList);
2445                 return u_terminateChars(result, resultAvailable, len, status);
2446             }
2447             if(len>maxLen) {
2448                 maxLen = len;
2449             }
2450         }
2451         uenum_reset(availableLocales, status);
2452         /* save off parent info */
2453         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2454             fallbackList[i] = uprv_strdup(tmp);
2455         } else {
2456             fallbackList[i]=0;
2457         }
2458     }
2459 
2460     for(maxLen--;maxLen>0;maxLen--) {
2461         for(i=0;i<acceptListCount;i++) {
2462             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2463 #if defined(ULOC_DEBUG)
2464                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2465 #endif
2466                 while((l=uenum_next(availableLocales, NULL, status))) {
2467 #if defined(ULOC_DEBUG)
2468                     fprintf(stderr,"  %s\n", l);
2469 #endif
2470                     len = (int32_t)uprv_strlen(l);
2471                     if(!uprv_strcmp(fallbackList[i], l)) {
2472                         if(outResult) {
2473                             *outResult = ULOC_ACCEPT_FALLBACK;
2474                         }
2475 #if defined(ULOC_DEBUG)
2476                         fprintf(stderr, "fallback MATCH! %s\n", l);
2477 #endif
2478                         if(len>0) {
2479                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2480                         }
2481                         for(j=0;j<acceptListCount;j++) {
2482                             uprv_free(fallbackList[j]);
2483                         }
2484                         uprv_free(fallbackList);
2485                         return u_terminateChars(result, resultAvailable, len, status);
2486                     }
2487                 }
2488                 uenum_reset(availableLocales, status);
2489 
2490                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2491                     uprv_free(fallbackList[i]);
2492                     fallbackList[i] = uprv_strdup(tmp);
2493                 } else {
2494                     uprv_free(fallbackList[i]);
2495                     fallbackList[i]=0;
2496                 }
2497             }
2498         }
2499         if(outResult) {
2500             *outResult = ULOC_ACCEPT_FAILED;
2501         }
2502     }
2503     for(i=0;i<acceptListCount;i++) {
2504         uprv_free(fallbackList[i]);
2505     }
2506     uprv_free(fallbackList);
2507     return -1;
2508 }
2509 
2510 /*eof*/
2511