1 /*
2 **********************************************************************
3 * Copyright (C) 1997-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File ULOC.CPP
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 04/01/97 aliu Creation.
13 * 08/21/98 stephen JDK 1.2 sync
14 * 12/08/98 rtg New Locale implementation and C API
15 * 03/15/99 damiba overhaul.
16 * 04/06/99 stephen changed setDefault() to realloc and copy
17 * 06/14/99 stephen Changed calls to ures_open for new params
18 * 07/21/99 stephen Modified setDefault() to propagate to C++
19 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
20 * brought canonicalization code into line with spec
21 *****************************************************************************/
22
23 /*
24 POSIX's locale format, from putil.c: [no spaces]
25
26 ll [ _CC ] [ . MM ] [ @ VV]
27
28 l = lang, C = ctry, M = charmap, V = variant
29 */
30
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/uloc.h"
34
35 #include "putilimp.h"
36 #include "ustr_imp.h"
37 #include "ulocimp.h"
38 #include "umutex.h"
39 #include "cstring.h"
40 #include "cmemory.h"
41 #include "ucln_cmn.h"
42 #include "locmap.h"
43 #include "uarrsort.h"
44 #include "uenumimp.h"
45 #include "uassert.h"
46
47 #include <stdio.h> /* for sprintf */
48
49 /* ### Declarations **************************************************/
50
/*
 * Locale stuff from locid.cpp.
 * These are declarations only; the matching definitions are not part of the
 * section of the file that precedes the keyword helpers below.
 */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/*
 * NOTE(review): the parameter list matches the static _getKeywords() helper in
 * this file minus its addKeyword/addValue arguments -- presumably a thin
 * wrapper around it; confirm against the definition.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
                   char prev,
                   char *keywords, int32_t keywordCapacity,
                   char *values, int32_t valuesCapacity, int32_t *valLen,
                   UBool valuesToo,
                   UErrorCode *status);
61
62 /* ### Data tables **************************************************/
63
64 /**
65 * Table of language codes, both 2- and 3-letter, with preference
66 * given to 2-letter codes where possible. Includes 3-letter codes
67 * that lack a 2-letter equivalent.
68 *
69 * This list must be in sorted order. This list is returned directly
70 * to the user by some API.
71 *
72 * This list must be kept in sync with LANGUAGES_3, with corresponding
73 * entries matched.
74 *
75 * This table should be terminated with a NULL entry, followed by a
76 * second list, and another NULL entry. The first list is visible to
77 * user code when this array is returned by API. The second list
78 * contains codes we support, but do not expose through user API.
79 *
80 * Notes
81 *
82 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
83 * include the revisions up to 2001/7/27 *CWB*
84 *
85 * The 3 character codes are the terminology codes like RFC 3066. This
86 * is compatible with prior ICU codes
87 *
88 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
89 * table but now at the end of the table because 3 character codes are
90 * duplicates. This avoids bad searches going from 3 to 2 character
91 * codes.
92 *
93 * The range qaa-qtz is reserved for local use
94 */
/*
 * Layout: <first list> NULL <second list> NULL.
 * Entry i of each list corresponds to entry i of the same list in LANGUAGES_3.
 */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
    "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
    "ang", "anp", "apa",
    "ar",  "arc", "arn", "arp", "art", "arw", "as",  "asa", "ast",
    "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
    "bai", "bal", "ban", "bas", "bat", "be",  "bej",
    "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",
    "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",
    "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
    "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",
    "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
    "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
    "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",
    "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",
    "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",
    "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
    "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
    "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
    "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
    "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
    "grc", "gsw", "gu",  "guz", "gv",  "gwi",
    "ha",  "hai", "haw", "he",  "hi",  "hil", "him",
    "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
    "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
    "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
    "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",
    "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",
    "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf",
    "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",
    "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
    "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",
    "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
    "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",
    "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
    "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",
    "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap", "naq",
    "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
    "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",
    "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
    "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
    "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
    "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",
    "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",
    "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",
    "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",
    "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
    "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
    "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",
    "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
    "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr",  "trv",
    "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",  "twq",
    "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
    "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",
    "wal", "war", "was", "wen", "wo",  "xal", "xh",  "xog", "yao", "yap", "yav",
    "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",
    "zu",  "zun", "zxx", "zza",
NULL,   /* terminates the first list */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL    /* terminates the second list */
};
/*
 * Deprecated 2-letter language codes and their replacements.
 * The two arrays have the same length and are laid out in the same order
 * ("in"/"id", "iw"/"he", "ji"/"yi", "jw"/"jv"); presumably entry i of one
 * pairs with entry i of the other -- confirm against the lookup code.
 * Each array ends with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
165
/**
 * Table of 3-letter language codes.
 *
 * This is a lookup table used to convert 3-letter language codes to
 * their 2-letter equivalent, where possible.  It must be kept in sync
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i].  The commented-out lines are
 * copied from LANGUAGES to make it easier to compare the two tables
 * by eye.
 *
 * Where a 3-letter language code has no 2-letter equivalent, the
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 * Consequently every entry in this table is exactly three characters
 * long; a 2-letter entry here is a data error.
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The two lists correspond to
 * the two lists in LANGUAGES.
 */
static const char * const LANGUAGES_3[] = {
/*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
/*  "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
    "afh", "agq", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
/*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "asa", "ast",    */
    "ara", "arc", "arn", "arp", "art", "arw", "asm", "asa", "ast",
/*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
    "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
/*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
    "bai", "bal", "ban", "bas", "bat", "bel", "bej",
/*  "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
    "bem", "ber", "bez", "bul", "bih", "bho", "bis", "bik", "bin",
/*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",     */
    "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "brx", "bos",
/*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
    "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
/*  "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",    */
    "cch", "che", "ceb", "cel", "cgg", "cha", "chb", "chg", "chk", "chm",
/*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
    "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
/*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
    "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
/*  "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",    */
    "chv", "cym", "dan", "dak", "dar", "dav", "day", "deu", "del", "den",
/*  "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",    */
    "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "div", "dyo", "dyu",
/*  "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
    "dzo", "ebu", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
/*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
    "enm", "epo", "spa", "est", "eus", "ewo", "fas",
/*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
    "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
/*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
    "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
/*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
    "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
/*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guz", "gv",     */
    "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guz", "glv",
/*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
    "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
/*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
    "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
/*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
    "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
/*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
    "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
/*  "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
    "iku", "jpn", "jbo", "jmc", "jpr", "jrb", "jav", "kat", "kaa", "kab",
/*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",*/
    /* "kg" (Kongo) maps to its ISO 639-2 code "kon"; this slot must not repeat the 2-letter code. */
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kon", "kha", "khi",
/*  "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",     */
    "kho", "khq", "kik", "kua", "kaz", "kal", "kln", "khm", "kmb", "kan",
/*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf", */
    "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas", "ksb", "ksf",
/*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",    */
    "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad", "lag",
/*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
    "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
/*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",    */
    "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus", "luy",
/*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
    "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
/*  "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",    */
    "mdf", "mdr", "men", "mer", "mfe", "mlg", "mga", "mgh", "mah", "mri", "mic", "min",
/*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
    "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
/*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",    */
    "mol", "moh", "mos", "mar", "msa", "mlt", "mua", "mul", "mun",
/*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap", "naq",   */
    "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nap", "naq",
/*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
    "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
/*  "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",    */
    "niu", "nld", "nmg", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub", "nus",
/*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
    "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
/*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
    "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
/*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
/*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
    "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
/*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",    */
    "raj", "rap", "rar", "roh", "run", "ron", "roa", "rof", "rom",
/*  "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",    */
    "rus", "rup", "kin", "rwk", "san", "sad", "sah", "sai", "sal", "sam", "saq",
/*  "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",    */
    "sas", "sat", "sbp", "srd", "scn", "sco", "snd", "sme", "seh", "sel", "sem", "ses",
/*  "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",    */
    "sag", "sga", "sgn", "shi", "shn", "sin", "sid", "sio", "sit",
/*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
    "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
/*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
    "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
/*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
    "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
/*  "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",    */
    "swe", "swa", "swc", "syc", "syr", "tam", "tai", "tel", "tem", "teo", "ter",
/*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
    "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
/*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
    "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
/*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",  "twq",    */
    "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi", "twq",
/*  "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
    "tah", "tyv", "tzm", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
/*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",    */
    "uzb", "vai", "ven", "vie", "vol", "vot", "vun", "wln", "wak",
/*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "xog", "yao", "yap", "yav",    */
    "wal", "war", "was", "wen", "wol", "xal", "xho", "xog", "yao", "yap", "yav",
/*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
    "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
/*  "zu",  "zun", "zxx", "zza",                                         */
    "zul", "zun", "zxx", "zza",
NULL,   /* terminates the first list */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    /* NOTE(review): "jaw" differs from the "jav" used for "jv" in the first list -- confirm this is intentional. */
    "ind", "heb", "yid", "jaw", "srp",
NULL    /* terminates the second list */
};
302
303 /**
304 * Table of 2-letter country codes.
305 *
306 * This list must be in sorted order. This list is returned directly
307 * to the user by some API.
308 *
309 * This list must be kept in sync with COUNTRIES_3, with corresponding
310 * entries matched.
311 *
312 * This table should be terminated with a NULL entry, followed by a
313 * second list, and another NULL entry. The first list is visible to
314 * user code when this array is returned by API. The second list
315 * contains codes we support, but do not expose through user API.
316 *
317 * Notes:
318 *
319 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
320 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
321 * new codes keeping the old ones for compatibility updated to include
322 * 1999/12/03 revisions *CWB*
323 *
324 * RO(ROM) is now RO(ROU) according to
325 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
326 */
/*
 * Layout: <first list> NULL <second list> NULL.
 * Entry i of each list corresponds to entry i of the same list in COUNTRIES_3.
 */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
    "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL,   /* terminates the first list */
    "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
NULL    /* terminates the second list */
};
362
/*
 * Deprecated 2-letter country codes and their replacements.
 * The commented row inside REPLACEMENT_COUNTRIES repeats the deprecated codes
 * so the two arrays can be compared column by column: entry i of
 * REPLACEMENT_COUNTRIES is listed under entry i of DEPRECATED_COUNTRIES.
 * Both "CS" and "YU" are listed with the replacement "RS".
 * Each array ends with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] ={
    "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
    "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
};
370
371 /**
372 * Table of 3-letter country codes.
373 *
374 * This is a lookup table used to convert 3-letter country codes to
375 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
376 * For all valid i, COUNTRIES[i] must refer to the same country as
377 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
378 * to make eyeballing this baby easier.
379 *
380 * This table should be terminated with a NULL entry, followed by a
381 * second list, and another NULL entry. The two lists correspond to
382 * the two lists in COUNTRIES.
383 */
/*
 * Layout: <first list> NULL <second list> NULL, matching COUNTRIES entry for
 * entry.  Each commented row repeats the COUNTRIES row that the following
 * data row corresponds to.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
/*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,   /* terminates the first list */
/*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
    "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
NULL    /* terminates the second list */
};
450
/**
 * One row of a locale-ID canonicalization table: an input ID, the ID it
 * canonicalizes to, and an optional keyword/value pair that accompanies
 * the canonical ID.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;
457
/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row is { id, canonicalID, keyword, value }.  Rows whose keyword is
 * non-NULL carry a keyword/value pair alongside the canonical ID (for
 * example "currency" for the *_PREEURO IDs, "collation" or "calendar" for
 * the old ICU names); rows with NULL keyword and value only rename the ID.
 *
 * The array has no terminating sentinel row, and the rows are not in strict
 * alphabetical order (e.g. "en_SCOUSE" precedes "en_BE_PREEURO").
 * NOTE(review): users of this table presumably take its length from sizeof
 * and scan it linearly -- confirm before relying on any ordering.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "cel_GAULISH",    "cel__GAULISH", NULL, NULL }, /* registered name */
    { "de_1901",        "de__1901", NULL, NULL }, /* registered name */
    { "de_1906",        "de__1906", NULL, NULL }, /* registered name */
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BOONT",       "en__BOONT", NULL, NULL }, /* registered name */
    { "en_SCOUSE",      "en__SCOUSE", NULL, NULL }, /* registered name */
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sl_ROZAJ",       "sl__ROZAJ", NULL, NULL }, /* registered name */
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "zh__GAN", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "zh__HAKKA", NULL, NULL }, /* registered name */
    { "zh_MIN",         "zh__MIN", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "zh__MINNAN", NULL, NULL }, /* registered name */
    { "zh_WUU",         "zh__WUU", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "zh__XIANG", NULL, NULL }, /* registered name */
    { "zh_YUE",         "zh__YUE", NULL, NULL }, /* registered name */
};
517
/**
 * One row of a variant table: a variant name and the keyword/value pair
 * associated with it.
 */
typedef struct VariantMap {
    const char *variant;          /* input ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} VariantMap;
523
/**
 * Variant names that have a keyword/value equivalent.  Each row is
 * { variant, keyword, value }; every row here supplies a non-NULL keyword.
 * The array has no terminating sentinel row.
 */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
529
530 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id looks like a BCP47 tag with an extension: it is
 * non-NULL, contains no '@', and getShortestSubtagLength() reports a
 * one-character subtag.
 *
 * The macro operates on its own argument only.  (It previously passed a
 * variable named `localeID` from the caller's scope to
 * getShortestSubtagLength(), which compiled only where such a variable
 * existed and silently tested the wrong string if `id` was something else.)
 *
 * `id` is evaluated up to three times, so pass a side-effect-free expression.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id.
 *
 * Calls uloc_forLanguageTag(id, buffer, length, NULL, err).  When that call
 * returns a positive length and *err is not a failure code, finalID is set
 * to buffer; in every other case finalID is set to the unmodified id.
 *
 * The expansion is a bare if/else statement, so use it only as a complete
 * statement inside braces (an enclosing unbraced if/else would mis-pair).
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) \
        if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) > 0 && !U_FAILURE(*(err))) { \
            finalID=(buffer); \
        } else { \
            finalID=(id); \
        }
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are the runs of characters between '_' or '-' separators.  A run is
 * only measured when the separator that ends it is reached, so the run after
 * the last separator never takes part in the comparison.  When no non-empty
 * run is followed by a separator, the length of the whole string is returned.
 *
 * @param localeID NUL-terminated locale ID; must not be NULL.
 * @return the length of the shortest separator-terminated subtag, or the
 *         length of localeID if there is none.
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    const char *cursor = localeID;
    int32_t shortest = -1;   /* -1: no separator-terminated subtag seen yet */
    int32_t current = 0;     /* length of the subtag being scanned */

    for (; *cursor != 0; ++cursor) {
        if (*cursor == '_' || *cursor == '-') {
            /* Empty runs (leading or doubled separators) are ignored. */
            if (current != 0 && (shortest < 0 || current < shortest)) {
                shortest = current;
            }
            current = 0;
        } else {
            ++current;
        }
    }

    /* cursor now points at the terminating NUL, so the difference is the string length. */
    return shortest >= 0 ? shortest : (int32_t)(cursor - localeID);
}
565
566 /* ### Keywords **************************************************/
567
/* Size in bytes of the keyword-name buffer in KeywordStruct, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the on-stack KeywordStruct array used while parsing keywords. */
#define ULOC_MAX_NO_KEYWORDS 25
570
571 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)572 locale_getKeywordsStart(const char *localeID) {
573 const char *result = NULL;
574 if((result = uprv_strchr(localeID, '@')) != NULL) {
575 return result;
576 }
577 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
578 else {
579 /* We do this because the @ sign is variant, and the @ sign used on one
580 EBCDIC machine won't be compiled the same way on other EBCDIC based
581 machines. */
582 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
583 const uint8_t *charToFind = ebcdicSigns;
584 while(*charToFind) {
585 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
586 return result;
587 }
588 charToFind++;
589 }
590 }
591 #endif
592 return NULL;
593 }
594
595 /**
596 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
597 * @param keywordName incoming name to be canonicalized
598 * @param status return status (keyword too long)
599 * @return length of the keyword name
600 */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)601 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
602 {
603 int32_t i;
604 int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
605
606 if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
607 /* keyword name too long for internal buffer */
608 *status = U_INTERNAL_PROGRAM_ERROR;
609 return 0;
610 }
611
612 /* normalize the keyword name */
613 for(i = 0; i < keywordNameLen; i++) {
614 buf[i] = uprv_tolower(keywordName[i]);
615 }
616 buf[i] = 0;
617
618 return keywordNameLen;
619 }
620
/* One keyword/value pair collected while parsing the keyword part of a locale ID. */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* NUL-terminated keyword name */
    int32_t keywordLen;                    /* length of keyword, excluding the NUL */
    const char *valueStart;                /* start of the value text; not a copy */
    int32_t valueLen;                      /* length of the value text at valueStart */
} KeywordStruct;
627
628 static int32_t U_CALLCONV
compareKeywordStructs(const void * context,const void * left,const void * right)629 compareKeywordStructs(const void *context, const void *left, const void *right) {
630 const char* leftString = ((const KeywordStruct *)left)->keyword;
631 const char* rightString = ((const KeywordStruct *)right)->keyword;
632 return uprv_strcmp(leftString, rightString);
633 }
634
635 /**
636 * Both addKeyword and addValue must already be in canonical form.
637 * Either both addKeyword and addValue are NULL, or neither is NULL.
638 * If they are not NULL they must be zero terminated.
 * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
640 */
641 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)642 _getKeywords(const char *localeID,
643 char prev,
644 char *keywords, int32_t keywordCapacity,
645 char *values, int32_t valuesCapacity, int32_t *valLen,
646 UBool valuesToo,
647 const char* addKeyword,
648 const char* addValue,
649 UErrorCode *status)
650 {
651 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
652
653 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
654 int32_t numKeywords = 0;
655 const char* pos = localeID;
656 const char* equalSign = NULL;
657 const char* semicolon = NULL;
658 int32_t i = 0, j, n;
659 int32_t keywordsLen = 0;
660 int32_t valuesLen = 0;
661
662 if(prev == '@') { /* start of keyword definition */
663 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
664 do {
665 UBool duplicate = FALSE;
666 /* skip leading spaces */
667 while(*pos == ' ') {
668 pos++;
669 }
670 if (!*pos) { /* handle trailing "; " */
671 break;
672 }
673 if(numKeywords == maxKeywords) {
674 *status = U_INTERNAL_PROGRAM_ERROR;
675 return 0;
676 }
677 equalSign = uprv_strchr(pos, '=');
678 semicolon = uprv_strchr(pos, ';');
679 /* lack of '=' [foo@currency] is illegal */
680 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
681 if(!equalSign || (semicolon && semicolon<equalSign)) {
682 *status = U_INVALID_FORMAT_ERROR;
683 return 0;
684 }
685 /* need to normalize both keyword and keyword name */
686 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
687 /* keyword name too long for internal buffer */
688 *status = U_INTERNAL_PROGRAM_ERROR;
689 return 0;
690 }
691 for(i = 0, n = 0; i < equalSign - pos; ++i) {
692 if (pos[i] != ' ') {
693 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
694 }
695 }
696 keywordList[numKeywords].keyword[n] = 0;
697 keywordList[numKeywords].keywordLen = n;
698 /* now grab the value part. First we skip the '=' */
699 equalSign++;
700 /* then we leading spaces */
701 while(*equalSign == ' ') {
702 equalSign++;
703 }
704 keywordList[numKeywords].valueStart = equalSign;
705
706 pos = semicolon;
707 i = 0;
708 if(pos) {
709 while(*(pos - i - 1) == ' ') {
710 i++;
711 }
712 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
713 pos++;
714 } else {
715 i = (int32_t)uprv_strlen(equalSign);
716 /* BEGIN android-changed
717 For http://b/issue?id=6008774 : out-of-boundary memory access */
718 while(i && equalSign[i-1] == ' ') {
719 i--;
720 }
721 /* END android-changed */
722 keywordList[numKeywords].valueLen = i;
723 }
724 /* If this is a duplicate keyword, then ignore it */
725 for (j=0; j<numKeywords; ++j) {
726 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
727 duplicate = TRUE;
728 break;
729 }
730 }
731 if (!duplicate) {
732 ++numKeywords;
733 }
734 } while(pos);
735
736 /* Handle addKeyword/addValue. */
737 if (addKeyword != NULL) {
738 UBool duplicate = FALSE;
739 U_ASSERT(addValue != NULL);
740 /* Search for duplicate; if found, do nothing. Explicit keyword
741 overrides addKeyword. */
742 for (j=0; j<numKeywords; ++j) {
743 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
744 duplicate = TRUE;
745 break;
746 }
747 }
748 if (!duplicate) {
749 if (numKeywords == maxKeywords) {
750 *status = U_INTERNAL_PROGRAM_ERROR;
751 return 0;
752 }
753 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
754 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
755 keywordList[numKeywords].valueStart = addValue;
756 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
757 ++numKeywords;
758 }
759 } else {
760 U_ASSERT(addValue == NULL);
761 }
762
763 /* now we have a list of keywords */
764 /* we need to sort it */
765 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
766
767 /* Now construct the keyword part */
768 for(i = 0; i < numKeywords; i++) {
769 if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
770 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
771 if(valuesToo) {
772 keywords[keywordsLen + keywordList[i].keywordLen] = '=';
773 } else {
774 keywords[keywordsLen + keywordList[i].keywordLen] = 0;
775 }
776 }
777 keywordsLen += keywordList[i].keywordLen + 1;
778 if(valuesToo) {
779 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
780 uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
781 }
782 keywordsLen += keywordList[i].valueLen;
783
784 if(i < numKeywords - 1) {
785 if(keywordsLen < keywordCapacity) {
786 keywords[keywordsLen] = ';';
787 }
788 keywordsLen++;
789 }
790 }
791 if(values) {
792 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
793 uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
794 values[valuesLen + keywordList[i].valueLen] = 0;
795 }
796 valuesLen += keywordList[i].valueLen + 1;
797 }
798 }
799 if(values) {
800 values[valuesLen] = 0;
801 if(valLen) {
802 *valLen = valuesLen;
803 }
804 }
805 return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
806 } else {
807 return 0;
808 }
809 }
810
811 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)812 locale_getKeywords(const char *localeID,
813 char prev,
814 char *keywords, int32_t keywordCapacity,
815 char *values, int32_t valuesCapacity, int32_t *valLen,
816 UBool valuesToo,
817 UErrorCode *status) {
818 return _getKeywords(localeID, prev, keywords, keywordCapacity,
819 values, valuesCapacity, valLen, valuesToo,
820 NULL, NULL, status);
821 }
822
823 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)824 uloc_getKeywordValue(const char* localeID,
825 const char* keywordName,
826 char* buffer, int32_t bufferCapacity,
827 UErrorCode* status)
828 {
829 const char* startSearchHere = NULL;
830 const char* nextSeparator = NULL;
831 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
832 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
833 int32_t i = 0;
834 int32_t result = 0;
835
836 if(status && U_SUCCESS(*status) && localeID) {
837 char tempBuffer[ULOC_FULLNAME_CAPACITY];
838 const char* tmpLocaleID;
839
840 if (_hasBCP47Extension(localeID)) {
841 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
842 } else {
843 tmpLocaleID=localeID;
844 }
845
846 startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
847 if(startSearchHere == NULL) {
848 /* no keywords, return at once */
849 return 0;
850 }
851
852 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
853 if(U_FAILURE(*status)) {
854 return 0;
855 }
856
857 /* find the first keyword */
858 while(startSearchHere) {
859 startSearchHere++;
860 /* skip leading spaces (allowed?) */
861 while(*startSearchHere == ' ') {
862 startSearchHere++;
863 }
864 nextSeparator = uprv_strchr(startSearchHere, '=');
865 /* need to normalize both keyword and keyword name */
866 if(!nextSeparator) {
867 break;
868 }
869 if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
870 /* keyword name too long for internal buffer */
871 *status = U_INTERNAL_PROGRAM_ERROR;
872 return 0;
873 }
874 for(i = 0; i < nextSeparator - startSearchHere; i++) {
875 localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
876 }
877 /* trim trailing spaces */
878 while(startSearchHere[i-1] == ' ') {
879 i--;
880 }
881 localeKeywordNameBuffer[i] = 0;
882
883 startSearchHere = uprv_strchr(nextSeparator, ';');
884
885 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
886 nextSeparator++;
887 while(*nextSeparator == ' ') {
888 nextSeparator++;
889 }
890 /* we actually found the keyword. Copy the value */
891 if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
892 while(*(startSearchHere-1) == ' ') {
893 startSearchHere--;
894 }
895 uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
896 result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
897 } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
898 i = (int32_t)uprv_strlen(nextSeparator);
899 while(nextSeparator[i - 1] == ' ') {
900 i--;
901 }
902 uprv_strncpy(buffer, nextSeparator, i);
903 result = u_terminateChars(buffer, bufferCapacity, i, status);
904 } else {
905 /* give a bigger buffer, please */
906 *status = U_BUFFER_OVERFLOW_ERROR;
907 if(startSearchHere) {
908 result = (int32_t)(startSearchHere - nextSeparator);
909 } else {
910 result = (int32_t)uprv_strlen(nextSeparator);
911 }
912 }
913 return result;
914 }
915 }
916 }
917 return 0;
918 }
919
920 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)921 uloc_setKeywordValue(const char* keywordName,
922 const char* keywordValue,
923 char* buffer, int32_t bufferCapacity,
924 UErrorCode* status)
925 {
926 /* TODO: sorting. removal. */
927 int32_t keywordNameLen;
928 int32_t keywordValueLen;
929 int32_t bufLen;
930 int32_t needLen = 0;
931 int32_t foundValueLen;
932 int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
933 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
934 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
935 int32_t i = 0;
936 int32_t rc;
937 char* nextSeparator = NULL;
938 char* nextEqualsign = NULL;
939 char* startSearchHere = NULL;
940 char* keywordStart = NULL;
941 char *insertHere = NULL;
942 if(U_FAILURE(*status)) {
943 return -1;
944 }
945 if(bufferCapacity>1) {
946 bufLen = (int32_t)uprv_strlen(buffer);
947 } else {
948 *status = U_ILLEGAL_ARGUMENT_ERROR;
949 return 0;
950 }
951 if(bufferCapacity<bufLen) {
952 /* The capacity is less than the length?! Is this NULL terminated? */
953 *status = U_ILLEGAL_ARGUMENT_ERROR;
954 return 0;
955 }
956 if(keywordValue && !*keywordValue) {
957 keywordValue = NULL;
958 }
959 if(keywordValue) {
960 keywordValueLen = (int32_t)uprv_strlen(keywordValue);
961 } else {
962 keywordValueLen = 0;
963 }
964 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
965 if(U_FAILURE(*status)) {
966 return 0;
967 }
968 startSearchHere = (char*)locale_getKeywordsStart(buffer);
969 if(startSearchHere == NULL || (startSearchHere[1]==0)) {
970 if(!keywordValue) { /* no keywords = nothing to remove */
971 return bufLen;
972 }
973
974 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
975 if(startSearchHere) { /* had a single @ */
976 needLen--; /* already had the @ */
977 /* startSearchHere points at the @ */
978 } else {
979 startSearchHere=buffer+bufLen;
980 }
981 if(needLen >= bufferCapacity) {
982 *status = U_BUFFER_OVERFLOW_ERROR;
983 return needLen; /* no change */
984 }
985 *startSearchHere = '@';
986 startSearchHere++;
987 uprv_strcpy(startSearchHere, keywordNameBuffer);
988 startSearchHere += keywordNameLen;
989 *startSearchHere = '=';
990 startSearchHere++;
991 uprv_strcpy(startSearchHere, keywordValue);
992 startSearchHere+=keywordValueLen;
993 return needLen;
994 } /* end shortcut - no @ */
995
996 keywordStart = startSearchHere;
997 /* search for keyword */
998 while(keywordStart) {
999 keywordStart++;
1000 /* skip leading spaces (allowed?) */
1001 while(*keywordStart == ' ') {
1002 keywordStart++;
1003 }
1004 nextEqualsign = uprv_strchr(keywordStart, '=');
1005 /* need to normalize both keyword and keyword name */
1006 if(!nextEqualsign) {
1007 break;
1008 }
1009 if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1010 /* keyword name too long for internal buffer */
1011 *status = U_INTERNAL_PROGRAM_ERROR;
1012 return 0;
1013 }
1014 for(i = 0; i < nextEqualsign - keywordStart; i++) {
1015 localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1016 }
1017 /* trim trailing spaces */
1018 while(keywordStart[i-1] == ' ') {
1019 i--;
1020 }
1021 localeKeywordNameBuffer[i] = 0;
1022
1023 nextSeparator = uprv_strchr(nextEqualsign, ';');
1024 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1025 if(rc == 0) {
1026 nextEqualsign++;
1027 while(*nextEqualsign == ' ') {
1028 nextEqualsign++;
1029 }
1030 /* we actually found the keyword. Change the value */
1031 if (nextSeparator) {
1032 keywordAtEnd = 0;
1033 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1034 } else {
1035 keywordAtEnd = 1;
1036 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1037 }
1038 if(keywordValue) { /* adding a value - not removing */
1039 if(foundValueLen == keywordValueLen) {
1040 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1041 return bufLen; /* no change in size */
1042 } else if(foundValueLen > keywordValueLen) {
1043 int32_t delta = foundValueLen - keywordValueLen;
1044 if(nextSeparator) { /* RH side */
1045 uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1046 }
1047 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1048 bufLen -= delta;
1049 buffer[bufLen]=0;
1050 return bufLen;
1051 } else { /* FVL < KVL */
1052 int32_t delta = keywordValueLen - foundValueLen;
1053 if((bufLen+delta) >= bufferCapacity) {
1054 *status = U_BUFFER_OVERFLOW_ERROR;
1055 return bufLen+delta;
1056 }
1057 if(nextSeparator) { /* RH side */
1058 uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1059 }
1060 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1061 bufLen += delta;
1062 buffer[bufLen]=0;
1063 return bufLen;
1064 }
1065 } else { /* removing a keyword */
1066 if(keywordAtEnd) {
1067 /* zero out the ';' or '@' just before startSearchhere */
1068 keywordStart[-1] = 0;
1069 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1070 } else {
1071 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1072 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1073 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1074 }
1075 }
1076 } else if(rc<0){ /* end match keyword */
1077 /* could insert at this location. */
1078 insertHere = keywordStart;
1079 }
1080 keywordStart = nextSeparator;
1081 } /* end loop searching */
1082
1083 if(!keywordValue) {
1084 return bufLen; /* removal of non-extant keyword - no change */
1085 }
1086
1087 /* we know there is at least one keyword. */
1088 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1089 if(needLen >= bufferCapacity) {
1090 *status = U_BUFFER_OVERFLOW_ERROR;
1091 return needLen; /* no change */
1092 }
1093
1094 if(insertHere) {
1095 uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1096 keywordStart = insertHere;
1097 } else {
1098 keywordStart = buffer+bufLen;
1099 *keywordStart = ';';
1100 keywordStart++;
1101 }
1102 uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1103 keywordStart += keywordNameLen;
1104 *keywordStart = '=';
1105 keywordStart++;
1106 uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1107 keywordStart+=keywordValueLen;
1108 if(insertHere) {
1109 *keywordStart = ';';
1110 keywordStart++;
1111 }
1112 buffer[needLen]=0;
1113 return needLen;
1114 }
1115
1116 /* ### ID parsing implementation **************************************************/
1117
/* TRUE if a is one of the letters that may start a special prefix.
   Arguments are parenthesized so that expression arguments bind correctly;
   note that the argument is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1128
/**
 * Find the first occurrence of c within at most len characters of str,
 * stopping early at a NUL terminator. The match test comes before the
 * terminator test, so c == 0 finds the terminator itself.
 *
 * @return pointer to the matching character, or NULL if there is none
 */
static char* _strnchr(const char* str, int32_t len, char c) {
    int32_t remaining;

    U_ASSERT(str != 0 && len >= 0);
    for (remaining = len; remaining != 0; --remaining, ++str) {
        if (*str == c) {
            return (char*) str;
        }
        if (*str == 0) {
            return NULL;
        }
    }
    return NULL;
}
1142
/**
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
 * a NULL entry, followed by more entries, and a second NULL entry.
 *
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
 * COUNTRIES_3.
 *
 * @return index of the matching entry counted from the start of 'list'
 *         (the first NULL entry occupies an index too), or -1 if 'key'
 *         is in neither section
 */
static int16_t _findIndex(const char* const* list, const char* key)
{
    const char* const* cursor = list;
    int32_t section;

    /* Walk the two consecutive NULL-terminated sections */
    for (section = 0; section < 2; ++section) {
        for (; *cursor != NULL; ++cursor) {
            if (uprv_strcmp(key, *cursor) == 0) {
                return (int16_t)(cursor - list);
            }
        }
        ++cursor;     /* step over the NULL that ends this section */
    }
    return -1;
}
1167
1168 /* count the length of src while copying it to dest; return strlen(src) */
1169 static U_INLINE int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1170 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1171 const char *anchor;
1172 char c;
1173
1174 anchor=src;
1175 for(;;) {
1176 if((c=*src)==0) {
1177 return (int32_t)(src-anchor);
1178 }
1179 if(destCapacity<=0) {
1180 return (int32_t)((src-anchor)+uprv_strlen(src));
1181 }
1182 ++src;
1183 *dest++=c;
1184 --destCapacity;
1185 }
1186 }
1187
1188 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1189 uloc_getCurrentCountryID(const char* oldID){
1190 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1191 if (offset >= 0) {
1192 return REPLACEMENT_COUNTRIES[offset];
1193 }
1194 return oldID;
1195 }
1196 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1197 uloc_getCurrentLanguageID(const char* oldID){
1198 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1199 if (offset >= 0) {
1200 return REPLACEMENT_LANGUAGES[offset];
1201 }
1202 return oldID;
1203 }
1204 /*
1205 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1206 * avoid duplicating code to handle the earlier locale ID pieces
1207 * in the functions for the later ones by
1208 * setting the *pEnd pointer to where they stopped parsing
1209 *
1210 * TODO try to use this in Locale
1211 */
1212 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1213 ulocimp_getLanguage(const char *localeID,
1214 char *language, int32_t languageCapacity,
1215 const char **pEnd) {
1216 int32_t i=0;
1217 int32_t offset;
1218 char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1219
1220 /* if it starts with i- or x- then copy that prefix */
1221 if(_isIDPrefix(localeID)) {
1222 if(i<languageCapacity) {
1223 language[i]=(char)uprv_tolower(*localeID);
1224 }
1225 if(i<languageCapacity) {
1226 language[i+1]='-';
1227 }
1228 i+=2;
1229 localeID+=2;
1230 }
1231
1232 /* copy the language as far as possible and count its length */
1233 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1234 if(i<languageCapacity) {
1235 language[i]=(char)uprv_tolower(*localeID);
1236 }
1237 if(i<3) {
1238 lang[i]=(char)uprv_tolower(*localeID);
1239 }
1240 i++;
1241 localeID++;
1242 }
1243
1244 if(i==3) {
1245 /* convert 3 character code to 2 character code if possible *CWB*/
1246 offset=_findIndex(LANGUAGES_3, lang);
1247 if(offset>=0) {
1248 i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1249 }
1250 }
1251
1252 if(pEnd!=NULL) {
1253 *pEnd=localeID;
1254 }
1255 return i;
1256 }
1257
1258 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1259 ulocimp_getScript(const char *localeID,
1260 char *script, int32_t scriptCapacity,
1261 const char **pEnd)
1262 {
1263 int32_t idLen = 0;
1264
1265 if (pEnd != NULL) {
1266 *pEnd = localeID;
1267 }
1268
1269 /* copy the second item as far as possible and count its length */
1270 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1271 idLen++;
1272 }
1273
1274 /* If it's exactly 4 characters long, then it's a script and not a country. */
1275 if (idLen == 4) {
1276 int32_t i;
1277 if (pEnd != NULL) {
1278 *pEnd = localeID+idLen;
1279 }
1280 if(idLen > scriptCapacity) {
1281 idLen = scriptCapacity;
1282 }
1283 if (idLen >= 1) {
1284 script[0]=(char)uprv_toupper(*(localeID++));
1285 }
1286 for (i = 1; i < idLen; i++) {
1287 script[i]=(char)uprv_tolower(*(localeID++));
1288 }
1289 }
1290 else {
1291 idLen = 0;
1292 }
1293 return idLen;
1294 }
1295
1296 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1297 ulocimp_getCountry(const char *localeID,
1298 char *country, int32_t countryCapacity,
1299 const char **pEnd)
1300 {
1301 int32_t idLen=0;
1302 char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1303 int32_t offset;
1304
1305 /* copy the country as far as possible and count its length */
1306 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1307 if(idLen<(ULOC_COUNTRY_CAPACITY-1)) { /*CWB*/
1308 cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1309 }
1310 idLen++;
1311 }
1312
1313 /* the country should be either length 2 or 3 */
1314 if (idLen == 2 || idLen == 3) {
1315 UBool gotCountry = FALSE;
1316 /* convert 3 character code to 2 character code if possible *CWB*/
1317 if(idLen==3) {
1318 offset=_findIndex(COUNTRIES_3, cnty);
1319 if(offset>=0) {
1320 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1321 gotCountry = TRUE;
1322 }
1323 }
1324 if (!gotCountry) {
1325 int32_t i = 0;
1326 for (i = 0; i < idLen; i++) {
1327 if (i < countryCapacity) {
1328 country[i]=(char)uprv_toupper(localeID[i]);
1329 }
1330 }
1331 }
1332 localeID+=idLen;
1333 } else {
1334 idLen = 0;
1335 }
1336
1337 if(pEnd!=NULL) {
1338 *pEnd=localeID;
1339 }
1340
1341 return idLen;
1342 }
1343
1344 /**
1345 * @param needSeparator if true, then add leading '_' if any variants
1346 * are added to 'variant'
1347 */
1348 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1349 _getVariantEx(const char *localeID,
1350 char prev,
1351 char *variant, int32_t variantCapacity,
1352 UBool needSeparator) {
1353 int32_t i=0;
1354
1355 /* get one or more variant tags and separate them with '_' */
1356 if(_isIDSeparator(prev)) {
1357 /* get a variant string after a '-' or '_' */
1358 while(!_isTerminator(*localeID)) {
1359 if (needSeparator) {
1360 if (i<variantCapacity) {
1361 variant[i] = '_';
1362 }
1363 ++i;
1364 needSeparator = FALSE;
1365 }
1366 if(i<variantCapacity) {
1367 variant[i]=(char)uprv_toupper(*localeID);
1368 if(variant[i]=='-') {
1369 variant[i]='_';
1370 }
1371 }
1372 i++;
1373 localeID++;
1374 }
1375 }
1376
1377 /* if there is no variant tag after a '-' or '_' then look for '@' */
1378 if(i==0) {
1379 if(prev=='@') {
1380 /* keep localeID */
1381 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1382 ++localeID; /* point after the '@' */
1383 } else {
1384 return 0;
1385 }
1386 while(!_isTerminator(*localeID)) {
1387 if (needSeparator) {
1388 if (i<variantCapacity) {
1389 variant[i] = '_';
1390 }
1391 ++i;
1392 needSeparator = FALSE;
1393 }
1394 if(i<variantCapacity) {
1395 variant[i]=(char)uprv_toupper(*localeID);
1396 if(variant[i]=='-' || variant[i]==',') {
1397 variant[i]='_';
1398 }
1399 }
1400 i++;
1401 localeID++;
1402 }
1403 }
1404
1405 return i;
1406 }
1407
1408 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1409 _getVariant(const char *localeID,
1410 char prev,
1411 char *variant, int32_t variantCapacity) {
1412 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1413 }
1414
/**
 * Delete ALL instances of a variant from the given list of one or
 * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
 * @param variants the source string of one or more variants,
 * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
 * terminated; if it is, trailing zero will NOT be maintained.
 * @param variantsLen length of variants
 * @param toDelete variant to delete, without separators, e.g.  "EURO"
 * or "PREEURO"; not zero terminated
 * @param toDeleteLen length of toDelete
 * @return number of characters deleted from variants
 */
static int32_t
_deleteVariant(char* variants, int32_t variantsLen,
               const char* toDelete, int32_t toDeleteLen)
{
    int32_t removed = 0; /* number of chars deleted so far */

    /* stop as soon as the remaining text is too short to hold toDelete */
    while (variantsLen >= toDeleteLen) {
        int32_t cut = -1; /* characters to drop here; -1 = no match */

        if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0) {
            if (variantsLen == toDeleteLen) {
                cut = toDeleteLen;       /* last variant, no separator follows */
            } else if (variants[toDeleteLen] == '_') {
                cut = toDeleteLen + 1;   /* drop the trailing separator as well */
            }
        }

        if (cut >= 0) {
            variantsLen -= cut;
            removed += cut;
            if (variantsLen > 0) {
                /* close the gap; the same position is examined again */
                uprv_memmove(variants, variants+cut, variantsLen);
            }
        } else {
            /* no match here: continue after the next separator */
            char* next = _strnchr(variants, variantsLen, '_');
            if (next == NULL) {
                break;
            }
            ++next;
            variantsLen -= (int32_t)(next - variants);
            variants = next;
        }
    }
    return removed;
}
1458
1459 /* Keyword enumeration */
1460
/**
 * State of a keyword enumeration: a heap-allocated list of NUL-separated
 * keyword strings ending with an empty string, plus the read position.
 */
struct UKeywordsContext {
    char* keywords; /* start of the list; released with uprv_free() on close */
    char* current;  /* next keyword to hand out; points into keywords */
};
typedef struct UKeywordsContext UKeywordsContext;
1465
1466 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1467 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1468 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1469 uprv_free(enumerator->context);
1470 uprv_free(enumerator);
1471 }
1472
1473 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode * status)1474 uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1475 char *kw = ((UKeywordsContext *)en->context)->keywords;
1476 int32_t result = 0;
1477 while(*kw) {
1478 result++;
1479 kw += uprv_strlen(kw)+1;
1480 }
1481 return result;
1482 }
1483
1484 static const char* U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode * status)1485 uloc_kw_nextKeyword(UEnumeration* en,
1486 int32_t* resultLength,
1487 UErrorCode* status) {
1488 const char* result = ((UKeywordsContext *)en->context)->current;
1489 int32_t len = 0;
1490 if(*result) {
1491 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1492 ((UKeywordsContext *)en->context)->current += len+1;
1493 } else {
1494 result = NULL;
1495 }
1496 if (resultLength) {
1497 *resultLength = len;
1498 }
1499 return result;
1500 }
1501
1502 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode * status)1503 uloc_kw_resetKeywords(UEnumeration* en,
1504 UErrorCode* status) {
1505 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1506 }
1507
1508 static const UEnumeration gKeywordsEnum = {
1509 NULL,
1510 NULL,
1511 uloc_kw_closeKeywords,
1512 uloc_kw_countKeywords,
1513 uenum_unextDefault,
1514 uloc_kw_nextKeyword,
1515 uloc_kw_resetKeywords
1516 };
1517
1518 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1519 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1520 {
1521 UKeywordsContext *myContext = NULL;
1522 UEnumeration *result = NULL;
1523
1524 if(U_FAILURE(*status)) {
1525 return NULL;
1526 }
1527 result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1528 /* Null pointer test */
1529 if (result == NULL) {
1530 *status = U_MEMORY_ALLOCATION_ERROR;
1531 return NULL;
1532 }
1533 uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1534 myContext = uprv_malloc(sizeof(UKeywordsContext));
1535 if (myContext == NULL) {
1536 *status = U_MEMORY_ALLOCATION_ERROR;
1537 uprv_free(result);
1538 return NULL;
1539 }
1540 myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1541 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1542 myContext->keywords[keywordListSize] = 0;
1543 myContext->current = myContext->keywords;
1544 result->context = myContext;
1545 return result;
1546 }
1547
1548 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1549 uloc_openKeywords(const char* localeID,
1550 UErrorCode* status)
1551 {
1552 int32_t i=0;
1553 char keywords[256];
1554 int32_t keywordsCapacity = 256;
1555 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1556 const char* tmpLocaleID;
1557
1558 if(status==NULL || U_FAILURE(*status)) {
1559 return 0;
1560 }
1561
1562 if (_hasBCP47Extension(localeID)) {
1563 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1564 } else {
1565 if (localeID==NULL) {
1566 localeID=uloc_getDefault();
1567 }
1568 tmpLocaleID=localeID;
1569 }
1570
1571 /* Skip the language */
1572 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1573 if(_isIDSeparator(*tmpLocaleID)) {
1574 const char *scriptID;
1575 /* Skip the script if available */
1576 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1577 if(scriptID != tmpLocaleID+1) {
1578 /* Found optional script */
1579 tmpLocaleID = scriptID;
1580 }
1581 /* Skip the Country */
1582 if (_isIDSeparator(*tmpLocaleID)) {
1583 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1584 if(_isIDSeparator(*tmpLocaleID)) {
1585 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1586 }
1587 }
1588 }
1589
1590 /* keywords are located after '@' */
1591 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1592 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1593 }
1594
1595 if(i) {
1596 return uloc_openKeywordList(keywords, i, status);
1597 } else {
1598 return NULL;
1599 }
1600 }
1601
1602
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of 'mask' is set in 'options'. Both arguments are
   parenthesized so that expressions such as A|B can be passed as a mask. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* "i-default", deliberately without a terminating NUL */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1611
1612 /**
1613 * Canonicalize the given localeID, to level 1 or to level 2,
1614 * depending on the options. To specify level 1, pass in options=0.
1615 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1616 *
1617 * This is the code underlying uloc_getName and uloc_canonicalize.
1618 */
1619 static int32_t
_canonicalize(const char * localeID,char * result,int32_t resultCapacity,uint32_t options,UErrorCode * err)1620 _canonicalize(const char* localeID,
1621 char* result,
1622 int32_t resultCapacity,
1623 uint32_t options,
1624 UErrorCode* err) {
1625 int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1626 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1627 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1628 const char* origLocaleID;
1629 const char* tmpLocaleID;
1630 const char* keywordAssign = NULL;
1631 const char* separatorIndicator = NULL;
1632 const char* addKeyword = NULL;
1633 const char* addValue = NULL;
1634 char* name;
1635 char* variant = NULL; /* pointer into name, or NULL */
1636
1637 if (U_FAILURE(*err)) {
1638 return 0;
1639 }
1640
1641 if (_hasBCP47Extension(localeID)) {
1642 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1643 } else {
1644 if (localeID==NULL) {
1645 localeID=uloc_getDefault();
1646 }
1647 tmpLocaleID=localeID;
1648 }
1649
1650 origLocaleID=tmpLocaleID;
1651
1652 /* if we are doing a full canonicalization, then put results in
1653 localeBuffer, if necessary; otherwise send them to result. */
1654 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1655 (result == NULL || resultCapacity < sizeof(localeBuffer))) {
1656 name = localeBuffer;
1657 nameCapacity = sizeof(localeBuffer);
1658 } else {
1659 name = result;
1660 nameCapacity = resultCapacity;
1661 }
1662
1663 /* get all pieces, one after another, and separate with '_' */
1664 len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1665
1666 if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1667 const char *d = uloc_getDefault();
1668
1669 len = (int32_t)uprv_strlen(d);
1670
1671 if (name != NULL) {
1672 uprv_strncpy(name, d, len);
1673 }
1674 } else if(_isIDSeparator(*tmpLocaleID)) {
1675 const char *scriptID;
1676
1677 ++fieldCount;
1678 if(len<nameCapacity) {
1679 name[len]='_';
1680 }
1681 ++len;
1682
1683 scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1684 if(scriptSize > 0) {
1685 /* Found optional script */
1686 tmpLocaleID = scriptID;
1687 ++fieldCount;
1688 len+=scriptSize;
1689 if (_isIDSeparator(*tmpLocaleID)) {
1690 /* If there is something else, then we add the _ */
1691 if(len<nameCapacity) {
1692 name[len]='_';
1693 }
1694 ++len;
1695 }
1696 }
1697
1698 if (_isIDSeparator(*tmpLocaleID)) {
1699 const char *cntryID;
1700 int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1701 if (cntrySize > 0) {
1702 /* Found optional country */
1703 tmpLocaleID = cntryID;
1704 len+=cntrySize;
1705 }
1706 if(_isIDSeparator(*tmpLocaleID)) {
1707 /* If there is something else, then we add the _ if we found country before.*/
1708 if (cntrySize > 0) {
1709 ++fieldCount;
1710 if(len<nameCapacity) {
1711 name[len]='_';
1712 }
1713 ++len;
1714 }
1715
1716 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1717 if (variantSize > 0) {
1718 variant = name+len;
1719 len += variantSize;
1720 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1721 }
1722 }
1723 }
1724 }
1725
1726 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1727 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1728 UBool done = FALSE;
1729 do {
1730 char c = *tmpLocaleID;
1731 switch (c) {
1732 case 0:
1733 case '@':
1734 done = TRUE;
1735 break;
1736 default:
1737 if (len<nameCapacity) {
1738 name[len] = c;
1739 }
1740 ++len;
1741 ++tmpLocaleID;
1742 break;
1743 }
1744 } while (!done);
1745 }
1746
1747 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1748 After this, tmpLocaleID either points to '@' or is NULL */
1749 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1750 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1751 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1752 }
1753
1754 /* Copy POSIX-style variant, if any [mr@FOO] */
1755 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1756 tmpLocaleID != NULL && keywordAssign == NULL) {
1757 for (;;) {
1758 char c = *tmpLocaleID;
1759 if (c == 0) {
1760 break;
1761 }
1762 if (len<nameCapacity) {
1763 name[len] = c;
1764 }
1765 ++len;
1766 ++tmpLocaleID;
1767 }
1768 }
1769
1770 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1771 /* Handle @FOO variant if @ is present and not followed by = */
1772 if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1773 int32_t posixVariantSize;
1774 /* Add missing '_' if needed */
1775 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1776 do {
1777 if(len<nameCapacity) {
1778 name[len]='_';
1779 }
1780 ++len;
1781 ++fieldCount;
1782 } while(fieldCount<2);
1783 }
1784 posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1785 (UBool)(variantSize > 0));
1786 if (posixVariantSize > 0) {
1787 if (variant == NULL) {
1788 variant = name+len;
1789 }
1790 len += posixVariantSize;
1791 variantSize += posixVariantSize;
1792 }
1793 }
1794
1795 /* Handle generic variants first */
1796 if (variant) {
1797 for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1798 const char* variantToCompare = VARIANT_MAP[j].variant;
1799 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1800 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1801 len -= variantLen;
1802 if (variantLen > 0) {
1803 /* BEGIN android-changed
1804 Apply fixes for ICU ticket8984. */
1805 if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1806 --len;
1807 }
1808 /* END android-changed */
1809 addKeyword = VARIANT_MAP[j].keyword;
1810 addValue = VARIANT_MAP[j].value;
1811 break;
1812 }
1813 }
1814 /* BEGIN android-changed
1815 Apply fixes for ICU ticket8984. */
1816 if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1817 --len;
1818 }
1819 /* END android-changed */
1820 }
1821
1822 /* Look up the ID in the canonicalization map */
1823 for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1824 const char* id = CANONICALIZE_MAP[j].id;
1825 int32_t n = (int32_t)uprv_strlen(id);
1826 if (len == n && uprv_strncmp(name, id, n) == 0) {
1827 if (n == 0 && tmpLocaleID != NULL) {
1828 break; /* Don't remap "" if keywords present */
1829 }
1830 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1831 if (CANONICALIZE_MAP[j].keyword) {
1832 addKeyword = CANONICALIZE_MAP[j].keyword;
1833 addValue = CANONICALIZE_MAP[j].value;
1834 }
1835 break;
1836 }
1837 }
1838 }
1839
1840 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1841 if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1842 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1843 if(len<nameCapacity) {
1844 name[len]='@';
1845 }
1846 ++len;
1847 ++fieldCount;
1848 len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1849 addKeyword, addValue, err);
1850 } else if (addKeyword != NULL) {
1851 U_ASSERT(addValue != NULL);
1852 /* inelegant but works -- later make _getKeywords do this? */
1853 len += _copyCount(name+len, nameCapacity-len, "@");
1854 len += _copyCount(name+len, nameCapacity-len, addKeyword);
1855 len += _copyCount(name+len, nameCapacity-len, "=");
1856 len += _copyCount(name+len, nameCapacity-len, addValue);
1857 }
1858 }
1859
1860 if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1861 uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1862 }
1863
1864 return u_terminateChars(result, resultCapacity, len, err);
1865 }
1866
1867 /* ### ID parsing API **************************************************/
1868
1869 U_CAPI int32_t U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1870 uloc_getParent(const char* localeID,
1871 char* parent,
1872 int32_t parentCapacity,
1873 UErrorCode* err)
1874 {
1875 const char *lastUnderscore;
1876 int32_t i;
1877
1878 if (U_FAILURE(*err))
1879 return 0;
1880
1881 if (localeID == NULL)
1882 localeID = uloc_getDefault();
1883
1884 lastUnderscore=uprv_strrchr(localeID, '_');
1885 if(lastUnderscore!=NULL) {
1886 i=(int32_t)(lastUnderscore-localeID);
1887 } else {
1888 i=0;
1889 }
1890
1891 if(i>0 && parent != localeID) {
1892 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1893 }
1894 return u_terminateChars(parent, parentCapacity, i, err);
1895 }
1896
1897 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1898 uloc_getLanguage(const char* localeID,
1899 char* language,
1900 int32_t languageCapacity,
1901 UErrorCode* err)
1902 {
1903 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1904 int32_t i=0;
1905
1906 if (err==NULL || U_FAILURE(*err)) {
1907 return 0;
1908 }
1909
1910 if(localeID==NULL) {
1911 localeID=uloc_getDefault();
1912 }
1913
1914 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1915 return u_terminateChars(language, languageCapacity, i, err);
1916 }
1917
1918 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1919 uloc_getScript(const char* localeID,
1920 char* script,
1921 int32_t scriptCapacity,
1922 UErrorCode* err)
1923 {
1924 int32_t i=0;
1925
1926 if(err==NULL || U_FAILURE(*err)) {
1927 return 0;
1928 }
1929
1930 if(localeID==NULL) {
1931 localeID=uloc_getDefault();
1932 }
1933
1934 /* skip the language */
1935 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1936 if(_isIDSeparator(*localeID)) {
1937 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1938 }
1939 return u_terminateChars(script, scriptCapacity, i, err);
1940 }
1941
1942 U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1943 uloc_getCountry(const char* localeID,
1944 char* country,
1945 int32_t countryCapacity,
1946 UErrorCode* err)
1947 {
1948 int32_t i=0;
1949
1950 if(err==NULL || U_FAILURE(*err)) {
1951 return 0;
1952 }
1953
1954 if(localeID==NULL) {
1955 localeID=uloc_getDefault();
1956 }
1957
1958 /* Skip the language */
1959 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1960 if(_isIDSeparator(*localeID)) {
1961 const char *scriptID;
1962 /* Skip the script if available */
1963 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1964 if(scriptID != localeID+1) {
1965 /* Found optional script */
1966 localeID = scriptID;
1967 }
1968 if(_isIDSeparator(*localeID)) {
1969 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1970 }
1971 }
1972 return u_terminateChars(country, countryCapacity, i, err);
1973 }
1974
1975 U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1976 uloc_getVariant(const char* localeID,
1977 char* variant,
1978 int32_t variantCapacity,
1979 UErrorCode* err)
1980 {
1981 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1982 const char* tmpLocaleID;
1983 int32_t i=0;
1984
1985 if(err==NULL || U_FAILURE(*err)) {
1986 return 0;
1987 }
1988
1989 if (_hasBCP47Extension(localeID)) {
1990 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1991 } else {
1992 if (localeID==NULL) {
1993 localeID=uloc_getDefault();
1994 }
1995 tmpLocaleID=localeID;
1996 }
1997
1998 /* Skip the language */
1999 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2000 if(_isIDSeparator(*tmpLocaleID)) {
2001 const char *scriptID;
2002 /* Skip the script if available */
2003 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2004 if(scriptID != tmpLocaleID+1) {
2005 /* Found optional script */
2006 tmpLocaleID = scriptID;
2007 }
2008 /* Skip the Country */
2009 if (_isIDSeparator(*tmpLocaleID)) {
2010 const char *cntryID;
2011 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2012 if (cntryID != tmpLocaleID+1) {
2013 /* Found optional country */
2014 tmpLocaleID = cntryID;
2015 }
2016 if(_isIDSeparator(*tmpLocaleID)) {
2017 /* If there was no country ID, skip a possible extra IDSeparator */
2018 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2019 tmpLocaleID++;
2020 }
2021 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2022 }
2023 }
2024 }
2025
2026 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2027 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2028 /*
2029 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2030 i=_getVariant(localeID+1, '@', variant, variantCapacity);
2031 }
2032 */
2033 return u_terminateChars(variant, variantCapacity, i, err);
2034 }
2035
2036 U_CAPI int32_t U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2037 uloc_getName(const char* localeID,
2038 char* name,
2039 int32_t nameCapacity,
2040 UErrorCode* err)
2041 {
2042 return _canonicalize(localeID, name, nameCapacity, 0, err);
2043 }
2044
2045 U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2046 uloc_getBaseName(const char* localeID,
2047 char* name,
2048 int32_t nameCapacity,
2049 UErrorCode* err)
2050 {
2051 return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2052 }
2053
2054 U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2055 uloc_canonicalize(const char* localeID,
2056 char* name,
2057 int32_t nameCapacity,
2058 UErrorCode* err)
2059 {
2060 return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2061 }
2062
2063 U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char * localeID)2064 uloc_getISO3Language(const char* localeID)
2065 {
2066 int16_t offset;
2067 char lang[ULOC_LANG_CAPACITY];
2068 UErrorCode err = U_ZERO_ERROR;
2069
2070 if (localeID == NULL)
2071 {
2072 localeID = uloc_getDefault();
2073 }
2074 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2075 if (U_FAILURE(err))
2076 return "";
2077 offset = _findIndex(LANGUAGES, lang);
2078 if (offset < 0)
2079 return "";
2080 return LANGUAGES_3[offset];
2081 }
2082
2083 U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char * localeID)2084 uloc_getISO3Country(const char* localeID)
2085 {
2086 int16_t offset;
2087 char cntry[ULOC_LANG_CAPACITY];
2088 UErrorCode err = U_ZERO_ERROR;
2089
2090 if (localeID == NULL)
2091 {
2092 localeID = uloc_getDefault();
2093 }
2094 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2095 if (U_FAILURE(err))
2096 return "";
2097 offset = _findIndex(COUNTRIES, cntry);
2098 if (offset < 0)
2099 return "";
2100
2101 return COUNTRIES_3[offset];
2102 }
2103
2104 U_CAPI uint32_t U_EXPORT2
uloc_getLCID(const char * localeID)2105 uloc_getLCID(const char* localeID)
2106 {
2107 UErrorCode status = U_ZERO_ERROR;
2108 char langID[ULOC_FULLNAME_CAPACITY];
2109
2110 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2111 if (U_FAILURE(status)) {
2112 return 0;
2113 }
2114
2115 return uprv_convertToLCID(langID, localeID, &status);
2116 }
2117
2118 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2119 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2120 UErrorCode *status)
2121 {
2122 int32_t length;
2123 const char *posix = uprv_convertToPosix(hostid, status);
2124 if (U_FAILURE(*status) || posix == NULL) {
2125 return 0;
2126 }
2127 length = (int32_t)uprv_strlen(posix);
2128 if (length+1 > localeCapacity) {
2129 *status = U_BUFFER_OVERFLOW_ERROR;
2130 }
2131 else {
2132 uprv_strcpy(locale, posix);
2133 }
2134 return length;
2135 }
2136
2137 /* ### Default locale **************************************************/
2138
2139 U_CAPI const char* U_EXPORT2
uloc_getDefault()2140 uloc_getDefault()
2141 {
2142 return locale_get_default();
2143 }
2144
2145 U_CAPI void U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2146 uloc_setDefault(const char* newDefaultLocale,
2147 UErrorCode* err)
2148 {
2149 if (U_FAILURE(*err))
2150 return;
2151 /* the error code isn't currently used for anything by this function*/
2152
2153 /* propagate change to C++ */
2154 locale_set_default(newDefaultLocale);
2155 }
2156
2157 /**
2158 * Returns a list of all language codes defined in ISO 639. This is a pointer
2159 * to an array of pointers to arrays of char. All of these pointers are owned
2160 * by ICU-- do not delete them, and do not write through them. The array is
2161 * terminated with a null pointer.
2162 */
2163 U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()2164 uloc_getISOLanguages()
2165 {
2166 return LANGUAGES;
2167 }
2168
2169 /**
2170 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2171 * pointer to an array of pointers to arrays of char. All of these pointers are
2172 * owned by ICU-- do not delete them, and do not write through them. The array is
2173 * terminated with a null pointer.
2174 */
2175 U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()2176 uloc_getISOCountries()
2177 {
2178 return COUNTRIES;
2179 }
2180
2181
/* this function to be moved into cstring.c later */

/* Decimal separator used by the C library's formatting; 0 until first use.
   NOTE(review): the lazy initialization below is not synchronized. */
static char gDecimal = 0;

/**
 * strtod() replacement that always accepts '.' as the decimal separator,
 * even when the C library currently formats and parses numbers with a
 * different separator.
 *
 * @param start Text to parse.
 * @param end   If not NULL, receives a pointer into start just past the
 *              parsed number (same contract as uprv_strtod).
 * @return The parsed value.
 */
static /* U_CAPI */
double
/* U_EXPORT2 */
_uloc_strtod(const char *start, char **end) {
    char *decimal;
    char *myEnd;
    char buf[30];
    double rv;
    if (!gDecimal) {
        /* "+1", the separator, "0" and the NUL. The separator may be a
           multi-byte sequence in some locales, so the buffer is sized
           generously instead of for exactly "+1.0". */
        char rep[32];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(rep, "%+1.1f", 1.0);
        /* NOTE(review): only the first byte of the separator is kept. */
        gDecimal = rep[2];
    }

    if(gDecimal == '.') {
        return uprv_strtod(start, end); /* fall through to OS */
    } else {
        /* work on a bounded copy in which '.' is replaced by the
           separator the C library expects */
        uprv_strncpy(buf, start, 29);
        buf[29]=0;
        decimal = uprv_strchr(buf, '.');
        if(decimal) {
            *decimal = gDecimal;
        } else {
            return uprv_strtod(start, end); /* no decimal point */
        }
        rv = uprv_strtod(buf, &myEnd);
        if(end) {
            /* translate the end position in buf back into start */
            *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
        }
        return rv;
    }
}
2220
/* One parsed entry of an Accept-Language list. */
typedef struct {
    float   q;      /* quality value; entries are sorted by it, highest first */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char   *locale; /* heap-allocated locale ID, released with uprv_free() */
} _acceptLangItem;
2226
2227 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void * context,const void * a,const void * b)2228 uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2229 {
2230 const _acceptLangItem *aa = (const _acceptLangItem*)a;
2231 const _acceptLangItem *bb = (const _acceptLangItem*)b;
2232
2233 int32_t rc = 0;
2234 if(bb->q < aa->q) {
2235 rc = -1; /* A > B */
2236 } else if(bb->q > aa->q) {
2237 rc = 1; /* A < B */
2238 } else {
2239 rc = 0; /* A = B */
2240 }
2241
2242 if(rc==0) {
2243 rc = uprv_stricmp(aa->locale, bb->locale);
2244 }
2245
2246 #if defined(ULOC_DEBUG)
2247 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2248 aa->locale, aa->q,
2249 bb->locale, bb->q,
2250 rc);*/
2251 #endif
2252
2253 return rc;
2254 }
2255
2256 /*
2257 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2258 */
2259
2260 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2261 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2262 const char *httpAcceptLanguage,
2263 UEnumeration* availableLocales,
2264 UErrorCode *status)
2265 {
2266 _acceptLangItem *j;
2267 _acceptLangItem smallBuffer[30];
2268 char **strs;
2269 char tmp[ULOC_FULLNAME_CAPACITY +1];
2270 int32_t n = 0;
2271 const char *itemEnd;
2272 const char *paramEnd;
2273 const char *s;
2274 const char *t;
2275 int32_t res;
2276 int32_t i;
2277 int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2278 int32_t jSize;
2279 char *tempstr; /* Use for null pointer check */
2280
2281 j = smallBuffer;
2282 jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2283 if(U_FAILURE(*status)) {
2284 return -1;
2285 }
2286
2287 for(s=httpAcceptLanguage;s&&*s;) {
2288 while(isspace(*s)) /* eat space at the beginning */
2289 s++;
2290 itemEnd=uprv_strchr(s,',');
2291 paramEnd=uprv_strchr(s,';');
2292 if(!itemEnd) {
2293 itemEnd = httpAcceptLanguage+l; /* end of string */
2294 }
2295 if(paramEnd && paramEnd<itemEnd) {
2296 /* semicolon (;) is closer than end (,) */
2297 t = paramEnd+1;
2298 if(*t=='q') {
2299 t++;
2300 }
2301 while(isspace(*t)) {
2302 t++;
2303 }
2304 if(*t=='=') {
2305 t++;
2306 }
2307 while(isspace(*t)) {
2308 t++;
2309 }
2310 j[n].q = (float)_uloc_strtod(t,NULL);
2311 } else {
2312 /* no semicolon - it's 1.0 */
2313 j[n].q = 1.0f;
2314 paramEnd = itemEnd;
2315 }
2316 j[n].dummy=0;
2317 /* eat spaces prior to semi */
2318 for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2319 ;
2320 /* Check for null pointer from uprv_strndup */
2321 tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2322 if (tempstr == NULL) {
2323 *status = U_MEMORY_ALLOCATION_ERROR;
2324 return -1;
2325 }
2326 j[n].locale = tempstr;
2327 uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2328 if(strcmp(j[n].locale,tmp)) {
2329 uprv_free(j[n].locale);
2330 j[n].locale=uprv_strdup(tmp);
2331 }
2332 #if defined(ULOC_DEBUG)
2333 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2334 #endif
2335 n++;
2336 s = itemEnd;
2337 while(*s==',') { /* eat duplicate commas */
2338 s++;
2339 }
2340 if(n>=jSize) {
2341 if(j==smallBuffer) { /* overflowed the small buffer. */
2342 j = uprv_malloc(sizeof(j[0])*(jSize*2));
2343 if(j!=NULL) {
2344 uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2345 }
2346 #if defined(ULOC_DEBUG)
2347 fprintf(stderr,"malloced at size %d\n", jSize);
2348 #endif
2349 } else {
2350 j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2351 #if defined(ULOC_DEBUG)
2352 fprintf(stderr,"re-alloced at size %d\n", jSize);
2353 #endif
2354 }
2355 jSize *= 2;
2356 if(j==NULL) {
2357 *status = U_MEMORY_ALLOCATION_ERROR;
2358 return -1;
2359 }
2360 }
2361 }
2362 uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2363 if(U_FAILURE(*status)) {
2364 if(j != smallBuffer) {
2365 #if defined(ULOC_DEBUG)
2366 fprintf(stderr,"freeing j %p\n", j);
2367 #endif
2368 uprv_free(j);
2369 }
2370 return -1;
2371 }
2372 strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2373 /* Check for null pointer */
2374 if (strs == NULL) {
2375 uprv_free(j); /* Free to avoid memory leak */
2376 *status = U_MEMORY_ALLOCATION_ERROR;
2377 return -1;
2378 }
2379 for(i=0;i<n;i++) {
2380 #if defined(ULOC_DEBUG)
2381 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2382 #endif
2383 strs[i]=j[i].locale;
2384 }
2385 res = uloc_acceptLanguage(result, resultAvailable, outResult,
2386 (const char**)strs, n, availableLocales, status);
2387 for(i=0;i<n;i++) {
2388 uprv_free(strs[i]);
2389 }
2390 uprv_free(strs);
2391 if(j != smallBuffer) {
2392 #if defined(ULOC_DEBUG)
2393 fprintf(stderr,"freeing j %p\n", j);
2394 #endif
2395 uprv_free(j);
2396 }
2397 return res;
2398 }
2399
2400
2401 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2402 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2403 UAcceptResult *outResult, const char **acceptList,
2404 int32_t acceptListCount,
2405 UEnumeration* availableLocales,
2406 UErrorCode *status)
2407 {
2408 int32_t i,j;
2409 int32_t len;
2410 int32_t maxLen=0;
2411 char tmp[ULOC_FULLNAME_CAPACITY+1];
2412 const char *l;
2413 char **fallbackList;
2414 if(U_FAILURE(*status)) {
2415 return -1;
2416 }
2417 fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2418 if(fallbackList==NULL) {
2419 *status = U_MEMORY_ALLOCATION_ERROR;
2420 return -1;
2421 }
2422 for(i=0;i<acceptListCount;i++) {
2423 #if defined(ULOC_DEBUG)
2424 fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2425 #endif
2426 while((l=uenum_next(availableLocales, NULL, status))) {
2427 #if defined(ULOC_DEBUG)
2428 fprintf(stderr," %s\n", l);
2429 #endif
2430 len = (int32_t)uprv_strlen(l);
2431 if(!uprv_strcmp(acceptList[i], l)) {
2432 if(outResult) {
2433 *outResult = ULOC_ACCEPT_VALID;
2434 }
2435 #if defined(ULOC_DEBUG)
2436 fprintf(stderr, "MATCH! %s\n", l);
2437 #endif
2438 if(len>0) {
2439 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2440 }
2441 for(j=0;j<i;j++) {
2442 uprv_free(fallbackList[j]);
2443 }
2444 uprv_free(fallbackList);
2445 return u_terminateChars(result, resultAvailable, len, status);
2446 }
2447 if(len>maxLen) {
2448 maxLen = len;
2449 }
2450 }
2451 uenum_reset(availableLocales, status);
2452 /* save off parent info */
2453 if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2454 fallbackList[i] = uprv_strdup(tmp);
2455 } else {
2456 fallbackList[i]=0;
2457 }
2458 }
2459
2460 for(maxLen--;maxLen>0;maxLen--) {
2461 for(i=0;i<acceptListCount;i++) {
2462 if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2463 #if defined(ULOC_DEBUG)
2464 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2465 #endif
2466 while((l=uenum_next(availableLocales, NULL, status))) {
2467 #if defined(ULOC_DEBUG)
2468 fprintf(stderr," %s\n", l);
2469 #endif
2470 len = (int32_t)uprv_strlen(l);
2471 if(!uprv_strcmp(fallbackList[i], l)) {
2472 if(outResult) {
2473 *outResult = ULOC_ACCEPT_FALLBACK;
2474 }
2475 #if defined(ULOC_DEBUG)
2476 fprintf(stderr, "fallback MATCH! %s\n", l);
2477 #endif
2478 if(len>0) {
2479 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2480 }
2481 for(j=0;j<acceptListCount;j++) {
2482 uprv_free(fallbackList[j]);
2483 }
2484 uprv_free(fallbackList);
2485 return u_terminateChars(result, resultAvailable, len, status);
2486 }
2487 }
2488 uenum_reset(availableLocales, status);
2489
2490 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2491 uprv_free(fallbackList[i]);
2492 fallbackList[i] = uprv_strdup(tmp);
2493 } else {
2494 uprv_free(fallbackList[i]);
2495 fallbackList[i]=0;
2496 }
2497 }
2498 }
2499 if(outResult) {
2500 *outResult = ULOC_ACCEPT_FAILED;
2501 }
2502 }
2503 for(i=0;i<acceptListCount;i++) {
2504 uprv_free(fallbackList[i]);
2505 }
2506 uprv_free(fallbackList);
2507 return -1;
2508 }
2509
2510 /*eof*/
2511