1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
24
25 /*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31 */
32
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
36
37 #include "putilimp.h"
38 #include "ustr_imp.h"
39 #include "ulocimp.h"
40 #include "umutex.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43 #include "locmap.h"
44 #include "uarrsort.h"
45 #include "uenumimp.h"
46 #include "uassert.h"
47
48 #include <stdio.h> /* for sprintf */
49
50 using namespace icu;
51
52 /* ### Declarations **************************************************/
53
/* Locale stuff from locid.cpp */
/*
 * The three functions below are only declared in this file; their
 * definitions live elsewhere (locid.cpp, per the note above).
 * NOTE(review): judging by the names, the first two set and fetch the
 * default locale ID -- confirm against locid.cpp.
 */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/*
 * Takes the same parameters as the file-local _getKeywords() helper, minus
 * its addKeyword/addValue pair.
 * NOTE(review): presumably a thin wrapper around _getKeywords() -- confirm
 * at the definition.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
char prev,
char *keywords, int32_t keywordCapacity,
char *values, int32_t valuesCapacity, int32_t *valLen,
UBool valuesToo,
UErrorCode *status);
64
65 /* ### Data tables **************************************************/
66
67 /**
68 * Table of language codes, both 2- and 3-letter, with preference
69 * given to 2-letter codes where possible. Includes 3-letter codes
70 * that lack a 2-letter equivalent.
71 *
72 * This list must be in sorted order. This list is returned directly
73 * to the user by some API.
74 *
75 * This list must be kept in sync with LANGUAGES_3, with corresponding
76 * entries matched.
77 *
78 * This table should be terminated with a NULL entry, followed by a
79 * second list, and another NULL entry. The first list is visible to
80 * user code when this array is returned by API. The second list
81 * contains codes we support, but do not expose through user API.
82 *
83 * Notes
84 *
85 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
86 * include the revisions up to 2001/7/27 *CWB*
87 *
88 * The 3 character codes are the terminology codes like RFC 3066. This
89 * is compatible with prior ICU codes
90 *
91 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
92 * table but now at the end of the table because 3 character codes are
93 * duplicates. This avoids bad searches going from 3 to 2 character
94 * codes.
95 *
96 * The range qaa-qtz is reserved for local use
97 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/*
 * Layout: <sorted list> NULL <obsolete codes> NULL.
 * Entry i pairs with LANGUAGES_3[i]; any insertion or removal here must be
 * mirrored at the same index in LANGUAGES_3.
 */
static const char * const LANGUAGES[] = {
"aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
"af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
"aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
"arn", "aro", "arp", "arq", "arw", "ary", "arz", "as",
"asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
"ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
"be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
"bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
"bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
"brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
"ca", "cad", "car", "cay", "cch", "ce", "ceb", "cgg",
"ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
"chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
"cs", "csb", "cu", "cv", "cy",
"da", "dak", "dar", "dav", "de", "del", "den", "dgr",
"din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
"dyo", "dyu", "dz", "dzg",
"ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
"en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
"ext",
"fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
"fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
"frs", "fur", "fy",
"ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
"gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
"gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
"gur", "guz", "gv", "gwi",
"ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
"hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
"hup", "hy", "hz",
"ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
"ilo", "inh", "io", "is", "it", "iu", "izh",
"ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
"jv",
"ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
"kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
"kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
"kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
"kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
"kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
"kv", "kw", "ky",
"la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
"lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
"lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
"lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
"mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
"mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
"mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
"ml", "mn", "mnc", "mni", "moh", "mos", "mr", "mrj",
"ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
"my", "mye", "myv", "mzn",
"na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
"new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
"nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
"nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
"oc", "oj", "om", "or", "os", "osa", "ota",
"pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
"pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
"pon", "prg", "pro", "ps", "pt",
"qu", "quc", "qug",
"raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
"rof", "rom", "rtm", "ru", "rue", "rug", "rup",
"rw", "rwk",
"sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
"sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
"se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
"sgs", "shi", "shn", "shu", "si", "sid", "sk",
"sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
"sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
"ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
"sv", "sw", "swb", "swc", "syc", "syr", "szl",
"ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
"th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
"tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
"tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
"tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
"udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
"vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
"vot", "vro", "vun",
"wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
"xal", "xh", "xmf", "xog",
"yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
"za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
"zun", "zxx", "zza",
NULL, /* end of the first (sorted) list */
"in", "iw", "ji", "jw", "sh", /* obsolete language codes */
NULL /* end of the second list */
};
189
/*
 * Parallel arrays: DEPRECATED_LANGUAGES[i] is a withdrawn language code and
 * REPLACEMENT_LANGUAGES[i] is the code listed in its place ("in" -> "id",
 * "iw" -> "he", "ji" -> "yi", "jw" -> "jv"). Both end with two NULL entries,
 * matching the double-NULL layout of the LANGUAGES table, and must stay the
 * same length.
 */
static const char* const DEPRECATED_LANGUAGES[]={
"in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
"id", "he", "yi", "jv", NULL, NULL
};
196
197 /**
198 * Table of 3-letter language codes.
199 *
200 * This is a lookup table used to convert 3-letter language codes to
201 * their 2-letter equivalent, where possible. It must be kept in sync
202 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
203 * same language as LANGUAGES_3[i]. The commented-out lines are
204 * copied from LANGUAGES to make eyeballing this baby easier.
205 *
206 * Where a 3-letter language code has no 2-letter equivalent, the
207 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
208 *
209 * This table should be terminated with a NULL entry, followed by a
210 * second list, and another NULL entry. The two lists correspond to
211 * the two lists in LANGUAGES.
212 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/*
 * Layout: <main list> NULL <codes for the obsolete 2-letter entries> NULL.
 * Entry i pairs with LANGUAGES[i]; the order follows LANGUAGES, so this
 * table is not itself sorted alphabetically.
 */
static const char * const LANGUAGES_3[] = {
"aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
"afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
"aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
"arn", "aro", "arp", "arq", "arw", "ary", "arz", "asm",
"asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
"bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
"bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
"bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
"bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
"brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
"cat", "cad", "car", "cay", "cch", "che", "ceb", "cgg",
"cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
"chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
"ces", "csb", "chu", "chv", "cym",
"dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
"din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
"dyo", "dyu", "dzo", "dzg",
"ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
"eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
"ext",
"fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
"fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
"frs", "fur", "fry",
"gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
"gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
"gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
"gur", "guz", "glv", "gwi",
"hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
"hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
"hup", "hye", "her",
"ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
"ilo", "inh", "ido", "isl", "ita", "iku", "izh",
"jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
"jav",
"kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
"kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
"kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
"kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
"kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
"kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
"kom", "cor", "kir",
"lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
"lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
"lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
"lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
"mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
"mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
"mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
"mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
"msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
"mya", "mye", "myv", "mzn",
"nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
"new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
"nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
"nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
"oci", "oji", "orm", "ori", "oss", "osa", "ota",
"pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
"pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
"pon", "prg", "pro", "pus", "por",
"que", "quc", "qug",
"raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
"rof", "rom", "rtm", "rus", "rue", "rug", "rup",
"kin", "rwk",
"san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
"sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
"sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
"sgs", "shi", "shn", "shu", "sin", "sid", "slk",
"slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
"sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
"ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
"swe", "swa", "swb", "swc", "syc", "syr", "szl",
"tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
"tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
"tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
"tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
"tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
"udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
"vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
"vot", "vro", "vun",
"wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
"xal", "xho", "xmf", "xog",
"yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
"zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
"zun", "zxx", "zza",
NULL, /* end of the first list */
/* "in", "iw", "ji", "jw", "sh", */
"ind", "heb", "yid", "jaw", "srp",
NULL /* end of the second list */
};
305
306 /**
307 * Table of 2-letter country codes.
308 *
309 * This list must be in sorted order. This list is returned directly
310 * to the user by some API.
311 *
312 * This list must be kept in sync with COUNTRIES_3, with corresponding
313 * entries matched.
314 *
315 * This table should be terminated with a NULL entry, followed by a
316 * second list, and another NULL entry. The first list is visible to
317 * user code when this array is returned by API. The second list
318 * contains codes we support, but do not expose through user API.
319 *
320 * Notes:
321 *
322 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324 * new codes keeping the old ones for compatibility updated to include
325 * 1999/12/03 revisions *CWB*
326 *
327 * RO(ROM) is now RO(ROU) according to
328 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329 */
/*
 * Layout: <sorted list> NULL <obsolete codes> NULL.
 * Entry i pairs with COUNTRIES_3[i]; any insertion or removal here must be
 * mirrored at the same index in COUNTRIES_3.
 */
static const char * const COUNTRIES[] = {
"AD", "AE", "AF", "AG", "AI", "AL", "AM",
"AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
"BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
"BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
"BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
"CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
"CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK",
"DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER",
"ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
"GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
"GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
"GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
"ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
"IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
"KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
"LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
"LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
"ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
"MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
"NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
"NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
"PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
"PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
"SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
"SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
"SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
"TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
"TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
"VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
"WS", "YE", "YT", "ZA", "ZM", "ZW",
NULL, /* end of the first (sorted) list */
"AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
NULL /* end of the second list */
};
365
/*
 * Parallel arrays: DEPRECATED_COUNTRIES[i] is a deprecated region code and
 * REPLACEMENT_COUNTRIES[i] is the code listed in its place (e.g. "AN" -> "CW",
 * "ZR" -> "CD"). Both end with two NULL entries and must stay the same
 * length; the commented-out row in REPLACEMENT_COUNTRIES repeats the
 * deprecated codes so the pairing can be checked by eye.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
"AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/* "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
"CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL /* replacement country codes */
};
373
374 /**
375 * Table of 3-letter country codes.
376 *
377 * This is a lookup table used to convert 3-letter country codes to
378 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
379 * For all valid i, COUNTRIES[i] must refer to the same country as
380 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
381 * to make eyeballing this baby easier.
382 *
383 * This table should be terminated with a NULL entry, followed by a
384 * second list, and another NULL entry. The two lists correspond to
385 * the two lists in COUNTRIES.
386 */
/*
 * Layout: <main list> NULL <codes for the obsolete 2-letter entries> NULL.
 * Each commented-out row repeats the matching COUNTRIES row, and the row
 * below it gives the 3-letter codes in the same order.
 */
static const char * const COUNTRIES_3[] = {
/* "AD", "AE", "AF", "AG", "AI", "AL", "AM", */
"AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/* "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", */
"AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/* "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", */
"BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/* "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", */
"BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/* "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", */
"BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR", */
"CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK", */
"CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/* "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER", */
"DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */
"ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */
"GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/* "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", */
"GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */
"GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/* "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", */
"IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */
"ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */
"COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/* "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", */
"LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/* "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", */
"LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/* "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", */
"MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/* "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", */
"MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/* "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", */
"NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/* "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", */
"NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/* "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", */
"PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/* "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", */
"PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/* "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", */
"SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/* "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", */
"SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/* "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ", */
"SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/* "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", */
"TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/* "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", */
"TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
"VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/* "WS", "YE", "YT", "ZA", "ZM", "ZW", */
"WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL, /* end of the first list */
/* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
"ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL /* end of the second list */
};
453
/**
 * One row of the CANONICALIZE_MAP table: an input locale ID, the ID it is
 * rewritten to, and an optional keyword/value pair associated with the
 * rewrite. All members point at string literals in the table.
 */
typedef struct CanonicalizationMap {
const char *id; /* input ID */
const char *canonicalID; /* canonicalized output ID */
const char *keyword; /* keyword, or NULL if none */
const char *value; /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;
460
/**
 * A map to canonicalize locale IDs. This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row is { id, canonicalID, keyword, value }. Rows whose keyword is
 * NULL are plain renames; the remaining rows pair the canonical ID with a
 * keyword and value (e.g. "de_DE_PREEURO" -> "de_DE" with currency=DEM).
 * The array has no terminating sentinel row.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
{ "", "en_US_POSIX", NULL, NULL }, /* .NET name */
{ "c", "en_US_POSIX", NULL, NULL }, /* POSIX name */
{ "posix", "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
{ "art_LOJBAN", "jbo", NULL, NULL }, /* registered name */
{ "az_AZ_CYRL", "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
{ "az_AZ_LATN", "az_Latn_AZ", NULL, NULL }, /* .NET name */
{ "ca_ES_PREEURO", "ca_ES", "currency", "ESP" },
{ "de__PHONEBOOK", "de", "collation", "phonebook" }, /* Old ICU name */
{ "de_AT_PREEURO", "de_AT", "currency", "ATS" },
{ "de_DE_PREEURO", "de_DE", "currency", "DEM" },
{ "de_LU_PREEURO", "de_LU", "currency", "LUF" },
{ "el_GR_PREEURO", "el_GR", "currency", "GRD" },
{ "en_BE_PREEURO", "en_BE", "currency", "BEF" },
{ "en_IE_PREEURO", "en_IE", "currency", "IEP" },
{ "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
{ "es_ES_PREEURO", "es_ES", "currency", "ESP" },
{ "eu_ES_PREEURO", "eu_ES", "currency", "ESP" },
{ "fi_FI_PREEURO", "fi_FI", "currency", "FIM" },
{ "fr_BE_PREEURO", "fr_BE", "currency", "BEF" },
{ "fr_FR_PREEURO", "fr_FR", "currency", "FRF" },
{ "fr_LU_PREEURO", "fr_LU", "currency", "LUF" },
{ "ga_IE_PREEURO", "ga_IE", "currency", "IEP" },
{ "gl_ES_PREEURO", "gl_ES", "currency", "ESP" },
{ "hi__DIRECT", "hi", "collation", "direct" }, /* Old ICU name */
{ "it_IT_PREEURO", "it_IT", "currency", "ITL" },
{ "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
{ "nb_NO_NY", "nn_NO", NULL, NULL }, /* "markus said this was ok" :-) */
{ "nl_BE_PREEURO", "nl_BE", "currency", "BEF" },
{ "nl_NL_PREEURO", "nl_NL", "currency", "NLG" },
{ "pt_PT_PREEURO", "pt_PT", "currency", "PTE" },
{ "sr_SP_CYRL", "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
{ "sr_SP_LATN", "sr_Latn_RS", NULL, NULL }, /* .NET name */
{ "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
{ "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
{ "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
{ "uz_UZ_CYRL", "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
{ "uz_UZ_LATN", "uz_Latn_UZ", NULL, NULL }, /* .NET name */
{ "zh_CHS", "zh_Hans", NULL, NULL }, /* .NET name */
{ "zh_CHT", "zh_Hant", NULL, NULL }, /* .NET name */
{ "zh_GAN", "gan", NULL, NULL }, /* registered name */
{ "zh_GUOYU", "zh", NULL, NULL }, /* registered name */
{ "zh_HAKKA", "hak", NULL, NULL }, /* registered name */
{ "zh_MIN_NAN", "nan", NULL, NULL }, /* registered name */
{ "zh_WUU", "wuu", NULL, NULL }, /* registered name */
{ "zh_XIANG", "hsn", NULL, NULL }, /* registered name */
{ "zh_YUE", "yue", NULL, NULL }, /* registered name */
};
513
/**
 * One row of the VARIANT_MAP table: a variant string and the keyword/value
 * pair associated with it. All members point at string literals in the table.
 */
typedef struct VariantMap {
const char *variant; /* input ID */
const char *keyword; /* keyword, or NULL if none */
const char *value; /* keyword value, or NULL if kw==NULL */
} VariantMap;
519
/*
 * Variant strings that have a keyword=value equivalent. Each row is
 * { variant, keyword, value }; the array has no terminating sentinel row.
 */
static const VariantMap VARIANT_MAP[] = {
{ "EURO", "currency", "EUR" },
{ "PINYIN", "collation", "pinyin" }, /* Solaris variant */
{ "STROKE", "collation", "stroke" } /* Solaris variant */
};
525
526 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 *
 * Evaluates to true when id is non-NULL, contains no '@', and
 * getShortestSubtagLength() reports a shortest subtag of length 1.
 *
 * The macro now inspects its own argument; it previously passed whatever
 * variable named localeID was in scope at the call site to
 * getShortestSubtagLength(), regardless of what was passed as id.
 * The argument is evaluated more than once, so it must have no side effects.
 */
#define _hasBCP47Extension(id) ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *
 * finalID  lvalue that receives either buffer (conversion succeeded) or id
 *          (conversion failed or produced nothing).
 * id       the incoming locale ID / language tag.
 * buffer   scratch buffer that receives the converted ID.
 * length   capacity of buffer.
 * err      UErrorCode pointer, passed through to uloc_forLanguageTag().
 *
 * A result that exactly fills buffer without a terminating NUL
 * (U_STRING_NOT_TERMINATED_WARNING) is treated as a failure: finalID falls
 * back to id and *err is raised to U_BUFFER_OVERFLOW_ERROR, so that callers
 * never read buffer as a C string when it is not terminated.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement; call
 * sites must end it with a semicolon.
 */
#define _ConvertBCP47(finalID, id, buffer, length, err) \
    do { \
        if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || \
                U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
            (finalID) = (id); \
            if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { \
                *(err) = U_BUFFER_OVERFLOW_ERROR; \
            } \
        } else { \
            (finalID) = (buffer); \
        } \
    } while (0)
536 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)537 static int32_t getShortestSubtagLength(const char *localeID) {
538 int32_t localeIDLength = uprv_strlen(localeID);
539 int32_t length = localeIDLength;
540 int32_t tmpLength = 0;
541 int32_t i;
542 UBool reset = TRUE;
543
544 for (i = 0; i < localeIDLength; i++) {
545 if (localeID[i] != '_' && localeID[i] != '-') {
546 if (reset) {
547 tmpLength = 0;
548 reset = FALSE;
549 }
550 tmpLength++;
551 } else {
552 if (tmpLength != 0 && tmpLength < length) {
553 length = tmpLength;
554 }
555 reset = TRUE;
556 }
557 }
558
559 return length;
560 }
561
562 /* ### Keywords **************************************************/
563
/* Size of KeywordStruct.keyword; a keyword name must be shorter than this so the terminating NUL fits. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keywordList array in _getKeywords(): the maximum number of distinct keywords handled per locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
566
567 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)568 locale_getKeywordsStart(const char *localeID) {
569 const char *result = NULL;
570 if((result = uprv_strchr(localeID, '@')) != NULL) {
571 return result;
572 }
573 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
574 else {
575 /* We do this because the @ sign is variant, and the @ sign used on one
576 EBCDIC machine won't be compiled the same way on other EBCDIC based
577 machines. */
578 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
579 const uint8_t *charToFind = ebcdicSigns;
580 while(*charToFind) {
581 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
582 return result;
583 }
584 charToFind++;
585 }
586 }
587 #endif
588 return NULL;
589 }
590
591 /**
592 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
593 * @param keywordName incoming name to be canonicalized
594 * @param status return status (keyword too long)
595 * @return length of the keyword name
596 */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)597 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
598 {
599 int32_t i;
600 int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
601
602 if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
603 /* keyword name too long for internal buffer */
604 *status = U_INTERNAL_PROGRAM_ERROR;
605 return 0;
606 }
607
608 /* normalize the keyword name */
609 for(i = 0; i < keywordNameLen; i++) {
610 buf[i] = uprv_tolower(keywordName[i]);
611 }
612 buf[i] = 0;
613
614 return keywordNameLen;
615 }
616
/*
 * One parsed keyword=value pair, as collected by _getKeywords().
 */
typedef struct {
char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased, NUL-terminated keyword name with spaces removed */
int32_t keywordLen; /* length of keyword, not counting the NUL */
const char *valueStart; /* first character of the value; points into the caller's string and is not NUL-terminated at valueLen */
int32_t valueLen; /* length of the value, not counting trailing spaces */
} KeywordStruct;
623
624 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)625 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
626 const char* leftString = ((const KeywordStruct *)left)->keyword;
627 const char* rightString = ((const KeywordStruct *)right)->keyword;
628 return uprv_strcmp(leftString, rightString);
629 }
630
631 /**
632 * Both addKeyword and addValue must already be in canonical form.
633 * Either both addKeyword and addValue are NULL, or neither is NULL.
634 * If they are not NULL they must be zero terminated.
 * If addKeyword is not NULL, it must have a length small enough to fit in KeywordStruct.keyword.
636 */
637 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)638 _getKeywords(const char *localeID,
639 char prev,
640 char *keywords, int32_t keywordCapacity,
641 char *values, int32_t valuesCapacity, int32_t *valLen,
642 UBool valuesToo,
643 const char* addKeyword,
644 const char* addValue,
645 UErrorCode *status)
646 {
647 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
648
649 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
650 int32_t numKeywords = 0;
651 const char* pos = localeID;
652 const char* equalSign = NULL;
653 const char* semicolon = NULL;
654 int32_t i = 0, j, n;
655 int32_t keywordsLen = 0;
656 int32_t valuesLen = 0;
657
658 if(prev == '@') { /* start of keyword definition */
659 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
660 do {
661 UBool duplicate = FALSE;
662 /* skip leading spaces */
663 while(*pos == ' ') {
664 pos++;
665 }
666 if (!*pos) { /* handle trailing "; " */
667 break;
668 }
669 if(numKeywords == maxKeywords) {
670 *status = U_INTERNAL_PROGRAM_ERROR;
671 return 0;
672 }
673 equalSign = uprv_strchr(pos, '=');
674 semicolon = uprv_strchr(pos, ';');
675 /* lack of '=' [foo@currency] is illegal */
676 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
677 if(!equalSign || (semicolon && semicolon<equalSign)) {
678 *status = U_INVALID_FORMAT_ERROR;
679 return 0;
680 }
681 /* need to normalize both keyword and keyword name */
682 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
683 /* keyword name too long for internal buffer */
684 *status = U_INTERNAL_PROGRAM_ERROR;
685 return 0;
686 }
687 for(i = 0, n = 0; i < equalSign - pos; ++i) {
688 if (pos[i] != ' ') {
689 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
690 }
691 }
692
693 /* zero-length keyword is an error. */
694 if (n == 0) {
695 *status = U_INVALID_FORMAT_ERROR;
696 return 0;
697 }
698
699 keywordList[numKeywords].keyword[n] = 0;
700 keywordList[numKeywords].keywordLen = n;
701 /* now grab the value part. First we skip the '=' */
702 equalSign++;
703 /* then we leading spaces */
704 while(*equalSign == ' ') {
705 equalSign++;
706 }
707
708 /* Premature end or zero-length value */
709 if (!*equalSign || equalSign == semicolon) {
710 *status = U_INVALID_FORMAT_ERROR;
711 return 0;
712 }
713
714 keywordList[numKeywords].valueStart = equalSign;
715
716 pos = semicolon;
717 i = 0;
718 if(pos) {
719 while(*(pos - i - 1) == ' ') {
720 i++;
721 }
722 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
723 pos++;
724 } else {
725 i = (int32_t)uprv_strlen(equalSign);
726 while(i && equalSign[i-1] == ' ') {
727 i--;
728 }
729 keywordList[numKeywords].valueLen = i;
730 }
731 /* If this is a duplicate keyword, then ignore it */
732 for (j=0; j<numKeywords; ++j) {
733 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
734 duplicate = TRUE;
735 break;
736 }
737 }
738 if (!duplicate) {
739 ++numKeywords;
740 }
741 } while(pos);
742
743 /* Handle addKeyword/addValue. */
744 if (addKeyword != NULL) {
745 UBool duplicate = FALSE;
746 U_ASSERT(addValue != NULL);
747 /* Search for duplicate; if found, do nothing. Explicit keyword
748 overrides addKeyword. */
749 for (j=0; j<numKeywords; ++j) {
750 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
751 duplicate = TRUE;
752 break;
753 }
754 }
755 if (!duplicate) {
756 if (numKeywords == maxKeywords) {
757 *status = U_INTERNAL_PROGRAM_ERROR;
758 return 0;
759 }
760 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
761 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
762 keywordList[numKeywords].valueStart = addValue;
763 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
764 ++numKeywords;
765 }
766 } else {
767 U_ASSERT(addValue == NULL);
768 }
769
770 /* now we have a list of keywords */
771 /* we need to sort it */
772 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
773
774 /* Now construct the keyword part */
775 for(i = 0; i < numKeywords; i++) {
776 if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
777 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
778 if(valuesToo) {
779 keywords[keywordsLen + keywordList[i].keywordLen] = '=';
780 } else {
781 keywords[keywordsLen + keywordList[i].keywordLen] = 0;
782 }
783 }
784 keywordsLen += keywordList[i].keywordLen + 1;
785 if(valuesToo) {
786 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
787 uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
788 }
789 keywordsLen += keywordList[i].valueLen;
790
791 if(i < numKeywords - 1) {
792 if(keywordsLen < keywordCapacity) {
793 keywords[keywordsLen] = ';';
794 }
795 keywordsLen++;
796 }
797 }
798 if(values) {
799 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
800 uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
801 values[valuesLen + keywordList[i].valueLen] = 0;
802 }
803 valuesLen += keywordList[i].valueLen + 1;
804 }
805 }
806 if(values) {
807 values[valuesLen] = 0;
808 if(valLen) {
809 *valLen = valuesLen;
810 }
811 }
812 return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
813 } else {
814 return 0;
815 }
816 }
817
818 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)819 locale_getKeywords(const char *localeID,
820 char prev,
821 char *keywords, int32_t keywordCapacity,
822 char *values, int32_t valuesCapacity, int32_t *valLen,
823 UBool valuesToo,
824 UErrorCode *status) {
825 return _getKeywords(localeID, prev, keywords, keywordCapacity,
826 values, valuesCapacity, valLen, valuesToo,
827 NULL, NULL, status);
828 }
829
830 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)831 uloc_getKeywordValue(const char* localeID,
832 const char* keywordName,
833 char* buffer, int32_t bufferCapacity,
834 UErrorCode* status)
835 {
836 const char* startSearchHere = NULL;
837 const char* nextSeparator = NULL;
838 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
839 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
840 int32_t i = 0;
841 int32_t result = 0;
842
843 if(status && U_SUCCESS(*status) && localeID) {
844 char tempBuffer[ULOC_FULLNAME_CAPACITY];
845 const char* tmpLocaleID;
846
847 if (_hasBCP47Extension(localeID)) {
848 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
849 } else {
850 tmpLocaleID=localeID;
851 }
852
853 startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
854 if(startSearchHere == NULL) {
855 /* no keywords, return at once */
856 return 0;
857 }
858
859 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
860 if(U_FAILURE(*status)) {
861 return 0;
862 }
863
864 /* find the first keyword */
865 while(startSearchHere) {
866 startSearchHere++;
867 /* skip leading spaces (allowed?) */
868 while(*startSearchHere == ' ') {
869 startSearchHere++;
870 }
871 nextSeparator = uprv_strchr(startSearchHere, '=');
872 /* need to normalize both keyword and keyword name */
873 if(!nextSeparator) {
874 break;
875 }
876 if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
877 /* keyword name too long for internal buffer */
878 *status = U_INTERNAL_PROGRAM_ERROR;
879 return 0;
880 }
881 for(i = 0; i < nextSeparator - startSearchHere; i++) {
882 localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
883 }
884 /* trim trailing spaces */
885 while(startSearchHere[i-1] == ' ') {
886 i--;
887 U_ASSERT(i>=0);
888 }
889 localeKeywordNameBuffer[i] = 0;
890
891 startSearchHere = uprv_strchr(nextSeparator, ';');
892
893 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
894 nextSeparator++;
895 while(*nextSeparator == ' ') {
896 nextSeparator++;
897 }
898 /* we actually found the keyword. Copy the value */
899 if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
900 while(*(startSearchHere-1) == ' ') {
901 startSearchHere--;
902 }
903 uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
904 result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
905 } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
906 i = (int32_t)uprv_strlen(nextSeparator);
907 while(nextSeparator[i - 1] == ' ') {
908 i--;
909 }
910 uprv_strncpy(buffer, nextSeparator, i);
911 result = u_terminateChars(buffer, bufferCapacity, i, status);
912 } else {
913 /* give a bigger buffer, please */
914 *status = U_BUFFER_OVERFLOW_ERROR;
915 if(startSearchHere) {
916 result = (int32_t)(startSearchHere - nextSeparator);
917 } else {
918 result = (int32_t)uprv_strlen(nextSeparator);
919 }
920 }
921 return result;
922 }
923 }
924 }
925 return 0;
926 }
927
928 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)929 uloc_setKeywordValue(const char* keywordName,
930 const char* keywordValue,
931 char* buffer, int32_t bufferCapacity,
932 UErrorCode* status)
933 {
934 /* TODO: sorting. removal. */
935 int32_t keywordNameLen;
936 int32_t keywordValueLen;
937 int32_t bufLen;
938 int32_t needLen = 0;
939 int32_t foundValueLen;
940 int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
941 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
942 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
943 int32_t i = 0;
944 int32_t rc;
945 char* nextSeparator = NULL;
946 char* nextEqualsign = NULL;
947 char* startSearchHere = NULL;
948 char* keywordStart = NULL;
949 char *insertHere = NULL;
950 if(U_FAILURE(*status)) {
951 return -1;
952 }
953 if(bufferCapacity>1) {
954 bufLen = (int32_t)uprv_strlen(buffer);
955 } else {
956 *status = U_ILLEGAL_ARGUMENT_ERROR;
957 return 0;
958 }
959 if(bufferCapacity<bufLen) {
960 /* The capacity is less than the length?! Is this NULL terminated? */
961 *status = U_ILLEGAL_ARGUMENT_ERROR;
962 return 0;
963 }
964 if(keywordValue && !*keywordValue) {
965 keywordValue = NULL;
966 }
967 if(keywordValue) {
968 keywordValueLen = (int32_t)uprv_strlen(keywordValue);
969 } else {
970 keywordValueLen = 0;
971 }
972 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
973 if(U_FAILURE(*status)) {
974 return 0;
975 }
976 startSearchHere = (char*)locale_getKeywordsStart(buffer);
977 if(startSearchHere == NULL || (startSearchHere[1]==0)) {
978 if(!keywordValue) { /* no keywords = nothing to remove */
979 return bufLen;
980 }
981
982 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
983 if(startSearchHere) { /* had a single @ */
984 needLen--; /* already had the @ */
985 /* startSearchHere points at the @ */
986 } else {
987 startSearchHere=buffer+bufLen;
988 }
989 if(needLen >= bufferCapacity) {
990 *status = U_BUFFER_OVERFLOW_ERROR;
991 return needLen; /* no change */
992 }
993 *startSearchHere = '@';
994 startSearchHere++;
995 uprv_strcpy(startSearchHere, keywordNameBuffer);
996 startSearchHere += keywordNameLen;
997 *startSearchHere = '=';
998 startSearchHere++;
999 uprv_strcpy(startSearchHere, keywordValue);
1000 startSearchHere+=keywordValueLen;
1001 return needLen;
1002 } /* end shortcut - no @ */
1003
1004 keywordStart = startSearchHere;
1005 /* search for keyword */
1006 while(keywordStart) {
1007 keywordStart++;
1008 /* skip leading spaces (allowed?) */
1009 while(*keywordStart == ' ') {
1010 keywordStart++;
1011 }
1012 nextEqualsign = uprv_strchr(keywordStart, '=');
1013 /* need to normalize both keyword and keyword name */
1014 if(!nextEqualsign) {
1015 break;
1016 }
1017 if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1018 /* keyword name too long for internal buffer */
1019 *status = U_INTERNAL_PROGRAM_ERROR;
1020 return 0;
1021 }
1022 for(i = 0; i < nextEqualsign - keywordStart; i++) {
1023 localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1024 }
1025 /* trim trailing spaces */
1026 while(keywordStart[i-1] == ' ') {
1027 i--;
1028 }
1029 U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1030 localeKeywordNameBuffer[i] = 0;
1031
1032 nextSeparator = uprv_strchr(nextEqualsign, ';');
1033 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1034 if(rc == 0) {
1035 nextEqualsign++;
1036 while(*nextEqualsign == ' ') {
1037 nextEqualsign++;
1038 }
1039 /* we actually found the keyword. Change the value */
1040 if (nextSeparator) {
1041 keywordAtEnd = 0;
1042 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1043 } else {
1044 keywordAtEnd = 1;
1045 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1046 }
1047 if(keywordValue) { /* adding a value - not removing */
1048 if(foundValueLen == keywordValueLen) {
1049 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1050 return bufLen; /* no change in size */
1051 } else if(foundValueLen > keywordValueLen) {
1052 int32_t delta = foundValueLen - keywordValueLen;
1053 if(nextSeparator) { /* RH side */
1054 uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1055 }
1056 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1057 bufLen -= delta;
1058 buffer[bufLen]=0;
1059 return bufLen;
1060 } else { /* FVL < KVL */
1061 int32_t delta = keywordValueLen - foundValueLen;
1062 if((bufLen+delta) >= bufferCapacity) {
1063 *status = U_BUFFER_OVERFLOW_ERROR;
1064 return bufLen+delta;
1065 }
1066 if(nextSeparator) { /* RH side */
1067 uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1068 }
1069 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1070 bufLen += delta;
1071 buffer[bufLen]=0;
1072 return bufLen;
1073 }
1074 } else { /* removing a keyword */
1075 if(keywordAtEnd) {
1076 /* zero out the ';' or '@' just before startSearchhere */
1077 keywordStart[-1] = 0;
1078 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1079 } else {
1080 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1081 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1082 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1083 }
1084 }
1085 } else if(rc<0){ /* end match keyword */
1086 /* could insert at this location. */
1087 insertHere = keywordStart;
1088 }
1089 keywordStart = nextSeparator;
1090 } /* end loop searching */
1091
1092 if(!keywordValue) {
1093 return bufLen; /* removal of non-extant keyword - no change */
1094 }
1095
1096 /* we know there is at least one keyword. */
1097 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1098 if(needLen >= bufferCapacity) {
1099 *status = U_BUFFER_OVERFLOW_ERROR;
1100 return needLen; /* no change */
1101 }
1102
1103 if(insertHere) {
1104 uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1105 keywordStart = insertHere;
1106 } else {
1107 keywordStart = buffer+bufLen;
1108 *keywordStart = ';';
1109 keywordStart++;
1110 }
1111 uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1112 keywordStart += keywordNameLen;
1113 *keywordStart = '=';
1114 keywordStart++;
1115 uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1116 keywordStart+=keywordValueLen;
1117 if(insertHere) {
1118 *keywordStart = ';';
1119 keywordStart++;
1120 }
1121 buffer[needLen]=0;
1122 return needLen;
1123 }
1124
1125 /* ### ID parsing implementation **************************************************/
1126
/* TRUE if a is one of the letters that can start a special prefix: x, X, i, I.
   Arguments are parenthesized so that any expression can be passed;
   note that the argument is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns TRUE if one of the special prefixes is here (s=string)
   'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1137
_strnchr(const char * str,int32_t len,char c)1138 static char* _strnchr(const char* str, int32_t len, char c) {
1139 U_ASSERT(str != 0 && len >= 0);
1140 while (len-- != 0) {
1141 char d = *str;
1142 if (d == c) {
1143 return (char*) str;
1144 } else if (d == 0) {
1145 break;
1146 }
1147 ++str;
1148 }
1149 return NULL;
1150 }
1151
1152 /**
1153 * Lookup 'key' in the array 'list'. The array 'list' should contain
1154 * a NULL entry, followed by more entries, and a second NULL entry.
1155 *
1156 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1157 * COUNTRIES_3.
1158 */
_findIndex(const char * const * list,const char * key)1159 static int16_t _findIndex(const char* const* list, const char* key)
1160 {
1161 const char* const* anchor = list;
1162 int32_t pass = 0;
1163
1164 /* Make two passes through two NULL-terminated arrays at 'list' */
1165 while (pass++ < 2) {
1166 while (*list) {
1167 if (uprv_strcmp(key, *list) == 0) {
1168 return (int16_t)(list - anchor);
1169 }
1170 list++;
1171 }
1172 ++list; /* skip final NULL *CWB*/
1173 }
1174 return -1;
1175 }
1176
1177 /* count the length of src while copying it to dest; return strlen(src) */
1178 static inline int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1179 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1180 const char *anchor;
1181 char c;
1182
1183 anchor=src;
1184 for(;;) {
1185 if((c=*src)==0) {
1186 return (int32_t)(src-anchor);
1187 }
1188 if(destCapacity<=0) {
1189 return (int32_t)((src-anchor)+uprv_strlen(src));
1190 }
1191 ++src;
1192 *dest++=c;
1193 --destCapacity;
1194 }
1195 }
1196
1197 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1198 uloc_getCurrentCountryID(const char* oldID){
1199 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1200 if (offset >= 0) {
1201 return REPLACEMENT_COUNTRIES[offset];
1202 }
1203 return oldID;
1204 }
1205 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1206 uloc_getCurrentLanguageID(const char* oldID){
1207 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1208 if (offset >= 0) {
1209 return REPLACEMENT_LANGUAGES[offset];
1210 }
1211 return oldID;
1212 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript(),
 * ulocimp_getCountry() and _getVariant() avoid duplicating the code that
 * handles the earlier locale ID pieces in the functions for the later ones:
 * those that take a pEnd parameter set *pEnd to where they stopped parsing.
 *
 * TODO try to use this in Locale
 */
1221 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1222 ulocimp_getLanguage(const char *localeID,
1223 char *language, int32_t languageCapacity,
1224 const char **pEnd) {
1225 int32_t i=0;
1226 int32_t offset;
1227 char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1228
1229 /* if it starts with i- or x- then copy that prefix */
1230 if(_isIDPrefix(localeID)) {
1231 if(i<languageCapacity) {
1232 language[i]=(char)uprv_tolower(*localeID);
1233 }
1234 if(i<languageCapacity) {
1235 language[i+1]='-';
1236 }
1237 i+=2;
1238 localeID+=2;
1239 }
1240
1241 /* copy the language as far as possible and count its length */
1242 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1243 if(i<languageCapacity) {
1244 language[i]=(char)uprv_tolower(*localeID);
1245 }
1246 if(i<3) {
1247 U_ASSERT(i>=0);
1248 lang[i]=(char)uprv_tolower(*localeID);
1249 }
1250 i++;
1251 localeID++;
1252 }
1253
1254 if(i==3) {
1255 /* convert 3 character code to 2 character code if possible *CWB*/
1256 offset=_findIndex(LANGUAGES_3, lang);
1257 if(offset>=0) {
1258 i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1259 }
1260 }
1261
1262 if(pEnd!=NULL) {
1263 *pEnd=localeID;
1264 }
1265 return i;
1266 }
1267
1268 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1269 ulocimp_getScript(const char *localeID,
1270 char *script, int32_t scriptCapacity,
1271 const char **pEnd)
1272 {
1273 int32_t idLen = 0;
1274
1275 if (pEnd != NULL) {
1276 *pEnd = localeID;
1277 }
1278
1279 /* copy the second item as far as possible and count its length */
1280 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1281 && uprv_isASCIILetter(localeID[idLen])) {
1282 idLen++;
1283 }
1284
1285 /* If it's exactly 4 characters long, then it's a script and not a country. */
1286 if (idLen == 4) {
1287 int32_t i;
1288 if (pEnd != NULL) {
1289 *pEnd = localeID+idLen;
1290 }
1291 if(idLen > scriptCapacity) {
1292 idLen = scriptCapacity;
1293 }
1294 if (idLen >= 1) {
1295 script[0]=(char)uprv_toupper(*(localeID++));
1296 }
1297 for (i = 1; i < idLen; i++) {
1298 script[i]=(char)uprv_tolower(*(localeID++));
1299 }
1300 }
1301 else {
1302 idLen = 0;
1303 }
1304 return idLen;
1305 }
1306
1307 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1308 ulocimp_getCountry(const char *localeID,
1309 char *country, int32_t countryCapacity,
1310 const char **pEnd)
1311 {
1312 int32_t idLen=0;
1313 char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1314 int32_t offset;
1315
1316 /* copy the country as far as possible and count its length */
1317 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1318 if(idLen<(ULOC_COUNTRY_CAPACITY-1)) { /*CWB*/
1319 cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1320 }
1321 idLen++;
1322 }
1323
1324 /* the country should be either length 2 or 3 */
1325 if (idLen == 2 || idLen == 3) {
1326 UBool gotCountry = FALSE;
1327 /* convert 3 character code to 2 character code if possible *CWB*/
1328 if(idLen==3) {
1329 offset=_findIndex(COUNTRIES_3, cnty);
1330 if(offset>=0) {
1331 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1332 gotCountry = TRUE;
1333 }
1334 }
1335 if (!gotCountry) {
1336 int32_t i = 0;
1337 for (i = 0; i < idLen; i++) {
1338 if (i < countryCapacity) {
1339 country[i]=(char)uprv_toupper(localeID[i]);
1340 }
1341 }
1342 }
1343 localeID+=idLen;
1344 } else {
1345 idLen = 0;
1346 }
1347
1348 if(pEnd!=NULL) {
1349 *pEnd=localeID;
1350 }
1351
1352 return idLen;
1353 }
1354
1355 /**
1356 * @param needSeparator if true, then add leading '_' if any variants
1357 * are added to 'variant'
1358 */
1359 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1360 _getVariantEx(const char *localeID,
1361 char prev,
1362 char *variant, int32_t variantCapacity,
1363 UBool needSeparator) {
1364 int32_t i=0;
1365
1366 /* get one or more variant tags and separate them with '_' */
1367 if(_isIDSeparator(prev)) {
1368 /* get a variant string after a '-' or '_' */
1369 while(!_isTerminator(*localeID)) {
1370 if (needSeparator) {
1371 if (i<variantCapacity) {
1372 variant[i] = '_';
1373 }
1374 ++i;
1375 needSeparator = FALSE;
1376 }
1377 if(i<variantCapacity) {
1378 variant[i]=(char)uprv_toupper(*localeID);
1379 if(variant[i]=='-') {
1380 variant[i]='_';
1381 }
1382 }
1383 i++;
1384 localeID++;
1385 }
1386 }
1387
1388 /* if there is no variant tag after a '-' or '_' then look for '@' */
1389 if(i==0) {
1390 if(prev=='@') {
1391 /* keep localeID */
1392 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1393 ++localeID; /* point after the '@' */
1394 } else {
1395 return 0;
1396 }
1397 while(!_isTerminator(*localeID)) {
1398 if (needSeparator) {
1399 if (i<variantCapacity) {
1400 variant[i] = '_';
1401 }
1402 ++i;
1403 needSeparator = FALSE;
1404 }
1405 if(i<variantCapacity) {
1406 variant[i]=(char)uprv_toupper(*localeID);
1407 if(variant[i]=='-' || variant[i]==',') {
1408 variant[i]='_';
1409 }
1410 }
1411 i++;
1412 localeID++;
1413 }
1414 }
1415
1416 return i;
1417 }
1418
1419 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1420 _getVariant(const char *localeID,
1421 char prev,
1422 char *variant, int32_t variantCapacity) {
1423 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1424 }
1425
1426 /**
1427 * Delete ALL instances of a variant from the given list of one or
1428 * more variants. Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1429 * @param variants the source string of one or more variants,
1430 * separated by '_'. This will be MODIFIED IN PLACE. Not zero
1431 * terminated; if it is, trailing zero will NOT be maintained.
1432 * @param variantsLen length of variants
1433 * @param toDelete variant to delete, without separators, e.g. "EURO"
1434 * or "PREEURO"; not zero terminated
1435 * @param toDeleteLen length of toDelete
1436 * @return number of characters deleted from variants
1437 */
1438 static int32_t
_deleteVariant(char * variants,int32_t variantsLen,const char * toDelete,int32_t toDeleteLen)1439 _deleteVariant(char* variants, int32_t variantsLen,
1440 const char* toDelete, int32_t toDeleteLen)
1441 {
1442 int32_t delta = 0; /* number of chars deleted */
1443 for (;;) {
1444 UBool flag = FALSE;
1445 if (variantsLen < toDeleteLen) {
1446 return delta;
1447 }
1448 if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1449 (variantsLen == toDeleteLen ||
1450 (flag=(variants[toDeleteLen] == '_'))))
1451 {
1452 int32_t d = toDeleteLen + (flag?1:0);
1453 variantsLen -= d;
1454 delta += d;
1455 if (variantsLen > 0) {
1456 uprv_memmove(variants, variants+d, variantsLen);
1457 }
1458 } else {
1459 char* p = _strnchr(variants, variantsLen, '_');
1460 if (p == NULL) {
1461 return delta;
1462 }
1463 ++p;
1464 variantsLen -= (int32_t)(p - variants);
1465 variants = p;
1466 }
1467 }
1468 }
1469
1470 /* Keyword enumeration */
1471
/* State kept by a keyword enumeration (stored in UEnumeration::context). */
typedef struct UKeywordsContext {
    /* start of the list: NUL-terminated keywords stored back to back and
       closed by an empty string; released with uprv_free() on close */
    char *keywords;
    /* next keyword to hand out; moves forward through 'keywords' */
    char *current;
} UKeywordsContext;
1476
1477 U_CDECL_BEGIN
1478
1479 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1480 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1481 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1482 uprv_free(enumerator->context);
1483 uprv_free(enumerator);
1484 }
1485
1486 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1487 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1488 char *kw = ((UKeywordsContext *)en->context)->keywords;
1489 int32_t result = 0;
1490 while(*kw) {
1491 result++;
1492 kw += uprv_strlen(kw)+1;
1493 }
1494 return result;
1495 }
1496
1497 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1498 uloc_kw_nextKeyword(UEnumeration* en,
1499 int32_t* resultLength,
1500 UErrorCode* /*status*/) {
1501 const char* result = ((UKeywordsContext *)en->context)->current;
1502 int32_t len = 0;
1503 if(*result) {
1504 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1505 ((UKeywordsContext *)en->context)->current += len+1;
1506 } else {
1507 result = NULL;
1508 }
1509 if (resultLength) {
1510 *resultLength = len;
1511 }
1512 return result;
1513 }
1514
1515 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1516 uloc_kw_resetKeywords(UEnumeration* en,
1517 UErrorCode* /*status*/) {
1518 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1519 }
1520
1521 U_CDECL_END
1522
1523
1524 static const UEnumeration gKeywordsEnum = {
1525 NULL,
1526 NULL,
1527 uloc_kw_closeKeywords,
1528 uloc_kw_countKeywords,
1529 uenum_unextDefault,
1530 uloc_kw_nextKeyword,
1531 uloc_kw_resetKeywords
1532 };
1533
1534 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1535 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1536 {
1537 UKeywordsContext *myContext = NULL;
1538 UEnumeration *result = NULL;
1539
1540 if(U_FAILURE(*status)) {
1541 return NULL;
1542 }
1543 result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1544 /* Null pointer test */
1545 if (result == NULL) {
1546 *status = U_MEMORY_ALLOCATION_ERROR;
1547 return NULL;
1548 }
1549 uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1550 myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1551 if (myContext == NULL) {
1552 *status = U_MEMORY_ALLOCATION_ERROR;
1553 uprv_free(result);
1554 return NULL;
1555 }
1556 myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1557 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1558 myContext->keywords[keywordListSize] = 0;
1559 myContext->current = myContext->keywords;
1560 result->context = myContext;
1561 return result;
1562 }
1563
1564 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1565 uloc_openKeywords(const char* localeID,
1566 UErrorCode* status)
1567 {
1568 int32_t i=0;
1569 char keywords[256];
1570 int32_t keywordsCapacity = 256;
1571 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1572 const char* tmpLocaleID;
1573
1574 if(status==NULL || U_FAILURE(*status)) {
1575 return 0;
1576 }
1577
1578 if (_hasBCP47Extension(localeID)) {
1579 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1580 } else {
1581 if (localeID==NULL) {
1582 localeID=uloc_getDefault();
1583 }
1584 tmpLocaleID=localeID;
1585 }
1586
1587 /* Skip the language */
1588 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1589 if(_isIDSeparator(*tmpLocaleID)) {
1590 const char *scriptID;
1591 /* Skip the script if available */
1592 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1593 if(scriptID != tmpLocaleID+1) {
1594 /* Found optional script */
1595 tmpLocaleID = scriptID;
1596 }
1597 /* Skip the Country */
1598 if (_isIDSeparator(*tmpLocaleID)) {
1599 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1600 if(_isIDSeparator(*tmpLocaleID)) {
1601 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1602 }
1603 }
1604 }
1605
1606 /* keywords are located after '@' */
1607 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1608 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1609 }
1610
1611 if(i) {
1612 return uloc_openKeywordList(keywords, i, status);
1613 } else {
1614 return NULL;
1615 }
1616 }
1617
1618
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of mask is set in options.  Both arguments are
   parenthesized so that expressions such as (A|B) can be passed as mask. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* the string "i-default", deliberately without a terminating NUL */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1627
1628 /**
1629 * Canonicalize the given localeID, to level 1 or to level 2,
1630 * depending on the options. To specify level 1, pass in options=0.
1631 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1632 *
1633 * This is the code underlying uloc_getName and uloc_canonicalize.
1634 */
1635 static int32_t
_canonicalize(const char * localeID,char * result,int32_t resultCapacity,uint32_t options,UErrorCode * err)1636 _canonicalize(const char* localeID,
1637 char* result,
1638 int32_t resultCapacity,
1639 uint32_t options,
1640 UErrorCode* err) {
1641 int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1642 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1643 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1644 const char* origLocaleID;
1645 const char* tmpLocaleID;
1646 const char* keywordAssign = NULL;
1647 const char* separatorIndicator = NULL;
1648 const char* addKeyword = NULL;
1649 const char* addValue = NULL;
1650 char* name;
1651 char* variant = NULL; /* pointer into name, or NULL */
1652
1653 if (U_FAILURE(*err)) {
1654 return 0;
1655 }
1656
1657 if (_hasBCP47Extension(localeID)) {
1658 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1659 } else {
1660 if (localeID==NULL) {
1661 localeID=uloc_getDefault();
1662 }
1663 tmpLocaleID=localeID;
1664 }
1665
1666 origLocaleID=tmpLocaleID;
1667
1668 /* if we are doing a full canonicalization, then put results in
1669 localeBuffer, if necessary; otherwise send them to result. */
1670 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1671 (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1672 name = localeBuffer;
1673 nameCapacity = (int32_t)sizeof(localeBuffer);
1674 } else {
1675 name = result;
1676 nameCapacity = resultCapacity;
1677 }
1678
1679 /* get all pieces, one after another, and separate with '_' */
1680 len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1681
1682 if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1683 const char *d = uloc_getDefault();
1684
1685 len = (int32_t)uprv_strlen(d);
1686
1687 if (name != NULL) {
1688 uprv_strncpy(name, d, len);
1689 }
1690 } else if(_isIDSeparator(*tmpLocaleID)) {
1691 const char *scriptID;
1692
1693 ++fieldCount;
1694 if(len<nameCapacity) {
1695 name[len]='_';
1696 }
1697 ++len;
1698
1699 scriptSize=ulocimp_getScript(tmpLocaleID+1,
1700 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1701 if(scriptSize > 0) {
1702 /* Found optional script */
1703 tmpLocaleID = scriptID;
1704 ++fieldCount;
1705 len+=scriptSize;
1706 if (_isIDSeparator(*tmpLocaleID)) {
1707 /* If there is something else, then we add the _ */
1708 if(len<nameCapacity) {
1709 name[len]='_';
1710 }
1711 ++len;
1712 }
1713 }
1714
1715 if (_isIDSeparator(*tmpLocaleID)) {
1716 const char *cntryID;
1717 int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1718 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1719 if (cntrySize > 0) {
1720 /* Found optional country */
1721 tmpLocaleID = cntryID;
1722 len+=cntrySize;
1723 }
1724 if(_isIDSeparator(*tmpLocaleID)) {
1725 /* If there is something else, then we add the _ if we found country before. */
1726 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1727 ++fieldCount;
1728 if(len<nameCapacity) {
1729 name[len]='_';
1730 }
1731 ++len;
1732 }
1733
1734 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1735 (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1736 if (variantSize > 0) {
1737 variant = len<nameCapacity ? name+len : NULL;
1738 len += variantSize;
1739 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1740 }
1741 }
1742 }
1743 }
1744
1745 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1746 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1747 UBool done = FALSE;
1748 do {
1749 char c = *tmpLocaleID;
1750 switch (c) {
1751 case 0:
1752 case '@':
1753 done = TRUE;
1754 break;
1755 default:
1756 if (len<nameCapacity) {
1757 name[len] = c;
1758 }
1759 ++len;
1760 ++tmpLocaleID;
1761 break;
1762 }
1763 } while (!done);
1764 }
1765
1766 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1767 After this, tmpLocaleID either points to '@' or is NULL */
1768 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1769 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1770 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1771 }
1772
1773 /* Copy POSIX-style variant, if any [mr@FOO] */
1774 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1775 tmpLocaleID != NULL && keywordAssign == NULL) {
1776 for (;;) {
1777 char c = *tmpLocaleID;
1778 if (c == 0) {
1779 break;
1780 }
1781 if (len<nameCapacity) {
1782 name[len] = c;
1783 }
1784 ++len;
1785 ++tmpLocaleID;
1786 }
1787 }
1788
1789 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1790 /* Handle @FOO variant if @ is present and not followed by = */
1791 if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1792 int32_t posixVariantSize;
1793 /* Add missing '_' if needed */
1794 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1795 do {
1796 if(len<nameCapacity) {
1797 name[len]='_';
1798 }
1799 ++len;
1800 ++fieldCount;
1801 } while(fieldCount<2);
1802 }
1803 posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1804 (UBool)(variantSize > 0));
1805 if (posixVariantSize > 0) {
1806 if (variant == NULL) {
1807 variant = name+len;
1808 }
1809 len += posixVariantSize;
1810 variantSize += posixVariantSize;
1811 }
1812 }
1813
1814 /* Handle generic variants first */
1815 if (variant) {
1816 for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
1817 const char* variantToCompare = VARIANT_MAP[j].variant;
1818 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1819 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1820 len -= variantLen;
1821 if (variantLen > 0) {
1822 if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1823 --len;
1824 }
1825 addKeyword = VARIANT_MAP[j].keyword;
1826 addValue = VARIANT_MAP[j].value;
1827 break;
1828 }
1829 }
1830 if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1831 --len;
1832 }
1833 }
1834
1835 /* Look up the ID in the canonicalization map */
1836 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1837 const char* id = CANONICALIZE_MAP[j].id;
1838 int32_t n = (int32_t)uprv_strlen(id);
1839 if (len == n && uprv_strncmp(name, id, n) == 0) {
1840 if (n == 0 && tmpLocaleID != NULL) {
1841 break; /* Don't remap "" if keywords present */
1842 }
1843 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1844 if (CANONICALIZE_MAP[j].keyword) {
1845 addKeyword = CANONICALIZE_MAP[j].keyword;
1846 addValue = CANONICALIZE_MAP[j].value;
1847 }
1848 break;
1849 }
1850 }
1851 }
1852
1853 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1854 if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1855 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1856 if(len<nameCapacity) {
1857 name[len]='@';
1858 }
1859 ++len;
1860 ++fieldCount;
1861 len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1862 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1863 } else if (addKeyword != NULL) {
1864 U_ASSERT(addValue != NULL && len < nameCapacity);
1865 /* inelegant but works -- later make _getKeywords do this? */
1866 len += _copyCount(name+len, nameCapacity-len, "@");
1867 len += _copyCount(name+len, nameCapacity-len, addKeyword);
1868 len += _copyCount(name+len, nameCapacity-len, "=");
1869 len += _copyCount(name+len, nameCapacity-len, addValue);
1870 }
1871 }
1872
1873 if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1874 uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1875 }
1876
1877 return u_terminateChars(result, resultCapacity, len, err);
1878 }
1879
1880 /* ### ID parsing API **************************************************/
1881
1882 U_CAPI int32_t U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1883 uloc_getParent(const char* localeID,
1884 char* parent,
1885 int32_t parentCapacity,
1886 UErrorCode* err)
1887 {
1888 const char *lastUnderscore;
1889 int32_t i;
1890
1891 if (U_FAILURE(*err))
1892 return 0;
1893
1894 if (localeID == NULL)
1895 localeID = uloc_getDefault();
1896
1897 lastUnderscore=uprv_strrchr(localeID, '_');
1898 if(lastUnderscore!=NULL) {
1899 i=(int32_t)(lastUnderscore-localeID);
1900 } else {
1901 i=0;
1902 }
1903
1904 if(i>0 && parent != localeID) {
1905 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1906 }
1907 return u_terminateChars(parent, parentCapacity, i, err);
1908 }
1909
1910 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1911 uloc_getLanguage(const char* localeID,
1912 char* language,
1913 int32_t languageCapacity,
1914 UErrorCode* err)
1915 {
1916 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1917 int32_t i=0;
1918
1919 if (err==NULL || U_FAILURE(*err)) {
1920 return 0;
1921 }
1922
1923 if(localeID==NULL) {
1924 localeID=uloc_getDefault();
1925 }
1926
1927 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1928 return u_terminateChars(language, languageCapacity, i, err);
1929 }
1930
1931 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1932 uloc_getScript(const char* localeID,
1933 char* script,
1934 int32_t scriptCapacity,
1935 UErrorCode* err)
1936 {
1937 int32_t i=0;
1938
1939 if(err==NULL || U_FAILURE(*err)) {
1940 return 0;
1941 }
1942
1943 if(localeID==NULL) {
1944 localeID=uloc_getDefault();
1945 }
1946
1947 /* skip the language */
1948 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1949 if(_isIDSeparator(*localeID)) {
1950 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1951 }
1952 return u_terminateChars(script, scriptCapacity, i, err);
1953 }
1954
1955 U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1956 uloc_getCountry(const char* localeID,
1957 char* country,
1958 int32_t countryCapacity,
1959 UErrorCode* err)
1960 {
1961 int32_t i=0;
1962
1963 if(err==NULL || U_FAILURE(*err)) {
1964 return 0;
1965 }
1966
1967 if(localeID==NULL) {
1968 localeID=uloc_getDefault();
1969 }
1970
1971 /* Skip the language */
1972 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1973 if(_isIDSeparator(*localeID)) {
1974 const char *scriptID;
1975 /* Skip the script if available */
1976 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1977 if(scriptID != localeID+1) {
1978 /* Found optional script */
1979 localeID = scriptID;
1980 }
1981 if(_isIDSeparator(*localeID)) {
1982 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1983 }
1984 }
1985 return u_terminateChars(country, countryCapacity, i, err);
1986 }
1987
1988 U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1989 uloc_getVariant(const char* localeID,
1990 char* variant,
1991 int32_t variantCapacity,
1992 UErrorCode* err)
1993 {
1994 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1995 const char* tmpLocaleID;
1996 int32_t i=0;
1997
1998 if(err==NULL || U_FAILURE(*err)) {
1999 return 0;
2000 }
2001
2002 if (_hasBCP47Extension(localeID)) {
2003 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2004 } else {
2005 if (localeID==NULL) {
2006 localeID=uloc_getDefault();
2007 }
2008 tmpLocaleID=localeID;
2009 }
2010
2011 /* Skip the language */
2012 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2013 if(_isIDSeparator(*tmpLocaleID)) {
2014 const char *scriptID;
2015 /* Skip the script if available */
2016 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2017 if(scriptID != tmpLocaleID+1) {
2018 /* Found optional script */
2019 tmpLocaleID = scriptID;
2020 }
2021 /* Skip the Country */
2022 if (_isIDSeparator(*tmpLocaleID)) {
2023 const char *cntryID;
2024 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2025 if (cntryID != tmpLocaleID+1) {
2026 /* Found optional country */
2027 tmpLocaleID = cntryID;
2028 }
2029 if(_isIDSeparator(*tmpLocaleID)) {
2030 /* If there was no country ID, skip a possible extra IDSeparator */
2031 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2032 tmpLocaleID++;
2033 }
2034 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2035 }
2036 }
2037 }
2038
2039 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2040 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2041 /*
2042 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2043 i=_getVariant(localeID+1, '@', variant, variantCapacity);
2044 }
2045 */
2046 return u_terminateChars(variant, variantCapacity, i, err);
2047 }
2048
/**
 * Produces the full name for a locale ID by delegating to _canonicalize()
 * with no option bits set (0): neither _ULOC_CANONICALIZE nor
 * _ULOC_STRIP_KEYWORDS is requested.
 *
 * @param localeID      Locale ID to process.
 * @param name          Receives the resulting name.
 * @param nameCapacity  Capacity of name, in chars.
 * @param err           In/out error code, forwarded to _canonicalize().
 * @return Whatever _canonicalize() returns.
 */
U_CAPI int32_t U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, 0, err);
}
2057
/**
 * Produces the base name for a locale ID by delegating to _canonicalize()
 * with the _ULOC_STRIP_KEYWORDS option, so the keyword section is not
 * appended to the result.
 *
 * @param localeID      Locale ID to process.
 * @param name          Receives the resulting name.
 * @param nameCapacity  Capacity of name, in chars.
 * @param err           In/out error code, forwarded to _canonicalize().
 * @return Whatever _canonicalize() returns.
 */
U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
}
2066
/**
 * Produces the canonical form of a locale ID by delegating to
 * _canonicalize() with the _ULOC_CANONICALIZE option, which enables the
 * POSIX "@VARIANT" handling and the variant/ID remapping tables there.
 *
 * @param localeID      Locale ID to process.
 * @param name          Receives the resulting name.
 * @param nameCapacity  Capacity of name, in chars.
 * @param err           In/out error code, forwarded to _canonicalize().
 * @return Whatever _canonicalize() returns.
 */
U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
}
2075
2076 U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char * localeID)2077 uloc_getISO3Language(const char* localeID)
2078 {
2079 int16_t offset;
2080 char lang[ULOC_LANG_CAPACITY];
2081 UErrorCode err = U_ZERO_ERROR;
2082
2083 if (localeID == NULL)
2084 {
2085 localeID = uloc_getDefault();
2086 }
2087 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2088 if (U_FAILURE(err))
2089 return "";
2090 offset = _findIndex(LANGUAGES, lang);
2091 if (offset < 0)
2092 return "";
2093 return LANGUAGES_3[offset];
2094 }
2095
2096 U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char * localeID)2097 uloc_getISO3Country(const char* localeID)
2098 {
2099 int16_t offset;
2100 char cntry[ULOC_LANG_CAPACITY];
2101 UErrorCode err = U_ZERO_ERROR;
2102
2103 if (localeID == NULL)
2104 {
2105 localeID = uloc_getDefault();
2106 }
2107 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2108 if (U_FAILURE(err))
2109 return "";
2110 offset = _findIndex(COUNTRIES, cntry);
2111 if (offset < 0)
2112 return "";
2113
2114 return COUNTRIES_3[offset];
2115 }
2116
2117 U_CAPI uint32_t U_EXPORT2
uloc_getLCID(const char * localeID)2118 uloc_getLCID(const char* localeID)
2119 {
2120 UErrorCode status = U_ZERO_ERROR;
2121 char langID[ULOC_FULLNAME_CAPACITY];
2122
2123 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2124 if (U_FAILURE(status)) {
2125 return 0;
2126 }
2127
2128 if (uprv_strchr(localeID, '@')) {
2129 // uprv_convertToLCID does not support keywords other than collation.
2130 // Remove all keywords except collation.
2131 int32_t len;
2132 char collVal[ULOC_KEYWORDS_CAPACITY];
2133 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2134
2135 len = uloc_getKeywordValue(localeID, "collation", collVal,
2136 UPRV_LENGTHOF(collVal) - 1, &status);
2137
2138 if (U_SUCCESS(status) && len > 0) {
2139 collVal[len] = 0;
2140
2141 len = uloc_getBaseName(localeID, tmpLocaleID,
2142 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2143
2144 if (U_SUCCESS(status) && len > 0) {
2145 tmpLocaleID[len] = 0;
2146
2147 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2148 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2149
2150 if (U_SUCCESS(status) && len > 0) {
2151 tmpLocaleID[len] = 0;
2152 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2153 }
2154 }
2155 }
2156
2157 // fall through - all keywords are simply ignored
2158 status = U_ZERO_ERROR;
2159 }
2160
2161 return uprv_convertToLCID(langID, localeID, &status);
2162 }
2163
/**
 * Converts a host LCID to a locale ID; a direct pass-through to
 * uprv_convertToPosix().
 *
 * @param hostid          The host LCID to convert.
 * @param locale          Receives the locale ID.
 * @param localeCapacity  Capacity of locale, in chars.
 * @param status          In/out error code, forwarded unchanged.
 * @return Whatever uprv_convertToPosix() returns.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                      UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2170
2171 /* ### Default locale **************************************************/
2172
/**
 * Returns the default locale ID; a direct pass-through to
 * locale_get_default() (declared above as coming from locid.cpp).
 */
U_CAPI const char* U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2178
2179 U_CAPI void U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2180 uloc_setDefault(const char* newDefaultLocale,
2181 UErrorCode* err)
2182 {
2183 if (U_FAILURE(*err))
2184 return;
2185 /* the error code isn't currently used for anything by this function*/
2186
2187 /* propagate change to C++ */
2188 locale_set_default(newDefaultLocale);
2189 }
2190
/**
 * Returns a list of all 2-letter language codes defined in ISO 639. This is a
 * pointer to an array of pointers to arrays of char. All of these pointers are
 * owned by ICU -- do not delete them, and do not write through them. The array
 * is terminated with a null pointer.
 *
 * @return The file-level LANGUAGES table.
 */
U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}
2202
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166. This is a
 * pointer to an array of pointers to arrays of char. All of these pointers are
 * owned by ICU -- do not delete them, and do not write through them. The array
 * is terminated with a null pointer.
 *
 * @return The file-level COUNTRIES table.
 */
U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}
2214
2215
2216 /* this function to be moved into cstring.c later */
2217 static char gDecimal = 0;
2218
2219 static /* U_CAPI */
2220 double
2221 /* U_EXPORT2 */
_uloc_strtod(const char * start,char ** end)2222 _uloc_strtod(const char *start, char **end) {
2223 char *decimal;
2224 char *myEnd;
2225 char buf[30];
2226 double rv;
2227 if (!gDecimal) {
2228 char rep[5];
2229 /* For machines that decide to change the decimal on you,
2230 and try to be too smart with localization.
2231 This normally should be just a '.'. */
2232 sprintf(rep, "%+1.1f", 1.0);
2233 gDecimal = rep[2];
2234 }
2235
2236 if(gDecimal == '.') {
2237 return uprv_strtod(start, end); /* fall through to OS */
2238 } else {
2239 uprv_strncpy(buf, start, 29);
2240 buf[29]=0;
2241 decimal = uprv_strchr(buf, '.');
2242 if(decimal) {
2243 *decimal = gDecimal;
2244 } else {
2245 return uprv_strtod(start, end); /* no decimal point */
2246 }
2247 rv = uprv_strtod(buf, &myEnd);
2248 if(end) {
2249 *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2250 }
2251 return rv;
2252 }
2253 }
2254
/* One entry of a parsed HTTP Accept-Language list, as collected and sorted by
   uloc_acceptLanguageFromHTTP(). */
typedef struct {
    float q;       /* quality value: parsed from the item's ";q=" parameter, 1.0 when absent */
    int32_t dummy; /* to avoid uninitialized memory copy from qsort */
    char locale[ULOC_FULLNAME_CAPACITY+1]; /* NUL-terminated locale ID of the item */
} _acceptLangItem;
2260
2261 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void *,const void * a,const void * b)2262 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2263 {
2264 const _acceptLangItem *aa = (const _acceptLangItem*)a;
2265 const _acceptLangItem *bb = (const _acceptLangItem*)b;
2266
2267 int32_t rc = 0;
2268 if(bb->q < aa->q) {
2269 rc = -1; /* A > B */
2270 } else if(bb->q > aa->q) {
2271 rc = 1; /* A < B */
2272 } else {
2273 rc = 0; /* A = B */
2274 }
2275
2276 if(rc==0) {
2277 rc = uprv_stricmp(aa->locale, bb->locale);
2278 }
2279
2280 #if defined(ULOC_DEBUG)
2281 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2282 aa->locale, aa->q,
2283 bb->locale, bb->q,
2284 rc);*/
2285 #endif
2286
2287 return rc;
2288 }
2289
2290 /*
2291 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2292 */
2293
2294 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2295 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2296 const char *httpAcceptLanguage,
2297 UEnumeration* availableLocales,
2298 UErrorCode *status)
2299 {
2300 MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2301 char tmp[ULOC_FULLNAME_CAPACITY +1];
2302 int32_t n = 0;
2303 const char *itemEnd;
2304 const char *paramEnd;
2305 const char *s;
2306 const char *t;
2307 int32_t res;
2308 int32_t i;
2309 int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2310
2311 if(U_FAILURE(*status)) {
2312 return -1;
2313 }
2314
2315 for(s=httpAcceptLanguage;s&&*s;) {
2316 while(isspace(*s)) /* eat space at the beginning */
2317 s++;
2318 itemEnd=uprv_strchr(s,',');
2319 paramEnd=uprv_strchr(s,';');
2320 if(!itemEnd) {
2321 itemEnd = httpAcceptLanguage+l; /* end of string */
2322 }
2323 if(paramEnd && paramEnd<itemEnd) {
2324 /* semicolon (;) is closer than end (,) */
2325 t = paramEnd+1;
2326 if(*t=='q') {
2327 t++;
2328 }
2329 while(isspace(*t)) {
2330 t++;
2331 }
2332 if(*t=='=') {
2333 t++;
2334 }
2335 while(isspace(*t)) {
2336 t++;
2337 }
2338 items[n].q = (float)_uloc_strtod(t,NULL);
2339 } else {
2340 /* no semicolon - it's 1.0 */
2341 items[n].q = 1.0f;
2342 paramEnd = itemEnd;
2343 }
2344 items[n].dummy=0;
2345 /* eat spaces prior to semi */
2346 for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2347 ;
2348 int32_t slen = ((t+1)-s);
2349 if(slen > ULOC_FULLNAME_CAPACITY) {
2350 *status = U_BUFFER_OVERFLOW_ERROR;
2351 return -1; // too big
2352 }
2353 uprv_strncpy(items[n].locale, s, slen);
2354 items[n].locale[slen]=0; // terminate
2355 int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2356 if(U_FAILURE(*status)) return -1;
2357 if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2358 // canonicalization had an effect- copy back
2359 uprv_strncpy(items[n].locale, tmp, clen);
2360 items[n].locale[clen] = 0; // terminate
2361 }
2362 #if defined(ULOC_DEBUG)
2363 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2364 #endif
2365 n++;
2366 s = itemEnd;
2367 while(*s==',') { /* eat duplicate commas */
2368 s++;
2369 }
2370 if(n>=items.getCapacity()) { // If we need more items
2371 if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2372 *status = U_MEMORY_ALLOCATION_ERROR;
2373 return -1;
2374 }
2375 #if defined(ULOC_DEBUG)
2376 fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2377 #endif
2378 }
2379 }
2380 uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2381 if (U_FAILURE(*status)) {
2382 return -1;
2383 }
2384 LocalMemory<const char*> strs(NULL);
2385 if (strs.allocateInsteadAndReset(n) == NULL) {
2386 *status = U_MEMORY_ALLOCATION_ERROR;
2387 return -1;
2388 }
2389 for(i=0;i<n;i++) {
2390 #if defined(ULOC_DEBUG)
2391 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2392 #endif
2393 strs[i]=items[i].locale;
2394 }
2395 res = uloc_acceptLanguage(result, resultAvailable, outResult,
2396 strs.getAlias(), n, availableLocales, status);
2397 return res;
2398 }
2399
2400
2401 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2402 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2403 UAcceptResult *outResult, const char **acceptList,
2404 int32_t acceptListCount,
2405 UEnumeration* availableLocales,
2406 UErrorCode *status)
2407 {
2408 int32_t i,j;
2409 int32_t len;
2410 int32_t maxLen=0;
2411 char tmp[ULOC_FULLNAME_CAPACITY+1];
2412 const char *l;
2413 char **fallbackList;
2414 if(U_FAILURE(*status)) {
2415 return -1;
2416 }
2417 fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2418 if(fallbackList==NULL) {
2419 *status = U_MEMORY_ALLOCATION_ERROR;
2420 return -1;
2421 }
2422 for(i=0;i<acceptListCount;i++) {
2423 #if defined(ULOC_DEBUG)
2424 fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2425 #endif
2426 while((l=uenum_next(availableLocales, NULL, status))) {
2427 #if defined(ULOC_DEBUG)
2428 fprintf(stderr," %s\n", l);
2429 #endif
2430 len = (int32_t)uprv_strlen(l);
2431 if(!uprv_strcmp(acceptList[i], l)) {
2432 if(outResult) {
2433 *outResult = ULOC_ACCEPT_VALID;
2434 }
2435 #if defined(ULOC_DEBUG)
2436 fprintf(stderr, "MATCH! %s\n", l);
2437 #endif
2438 if(len>0) {
2439 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2440 }
2441 for(j=0;j<i;j++) {
2442 uprv_free(fallbackList[j]);
2443 }
2444 uprv_free(fallbackList);
2445 return u_terminateChars(result, resultAvailable, len, status);
2446 }
2447 if(len>maxLen) {
2448 maxLen = len;
2449 }
2450 }
2451 uenum_reset(availableLocales, status);
2452 /* save off parent info */
2453 if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2454 fallbackList[i] = uprv_strdup(tmp);
2455 } else {
2456 fallbackList[i]=0;
2457 }
2458 }
2459
2460 for(maxLen--;maxLen>0;maxLen--) {
2461 for(i=0;i<acceptListCount;i++) {
2462 if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2463 #if defined(ULOC_DEBUG)
2464 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2465 #endif
2466 while((l=uenum_next(availableLocales, NULL, status))) {
2467 #if defined(ULOC_DEBUG)
2468 fprintf(stderr," %s\n", l);
2469 #endif
2470 len = (int32_t)uprv_strlen(l);
2471 if(!uprv_strcmp(fallbackList[i], l)) {
2472 if(outResult) {
2473 *outResult = ULOC_ACCEPT_FALLBACK;
2474 }
2475 #if defined(ULOC_DEBUG)
2476 fprintf(stderr, "fallback MATCH! %s\n", l);
2477 #endif
2478 if(len>0) {
2479 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2480 }
2481 for(j=0;j<acceptListCount;j++) {
2482 uprv_free(fallbackList[j]);
2483 }
2484 uprv_free(fallbackList);
2485 return u_terminateChars(result, resultAvailable, len, status);
2486 }
2487 }
2488 uenum_reset(availableLocales, status);
2489
2490 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2491 uprv_free(fallbackList[i]);
2492 fallbackList[i] = uprv_strdup(tmp);
2493 } else {
2494 uprv_free(fallbackList[i]);
2495 fallbackList[i]=0;
2496 }
2497 }
2498 }
2499 if(outResult) {
2500 *outResult = ULOC_ACCEPT_FAILED;
2501 }
2502 }
2503 for(i=0;i<acceptListCount;i++) {
2504 uprv_free(fallbackList[i]);
2505 }
2506 uprv_free(fallbackList);
2507 return -1;
2508 }
2509
2510 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2511 uloc_toUnicodeLocaleKey(const char* keyword)
2512 {
2513 const char* bcpKey = ulocimp_toBcpKey(keyword);
2514 if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2515 // unknown keyword, but syntax is fine..
2516 return keyword;
2517 }
2518 return bcpKey;
2519 }
2520
2521 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2522 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2523 {
2524 const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2525 if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2526 // unknown keyword, but syntax is fine..
2527 return value;
2528 }
2529 return bcpType;
2530 }
2531
/* Character-class helpers for the legacy key/type syntax checks.
   UPRV_ISDIGIT tests for '0'..'9'; UPRV_ISALPHANUM additionally accepts
   whatever uprv_isASCIILetter() accepts. Both expand their argument more than
   once, so only pass expressions without side effects. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
2534
2535 static UBool
isWellFormedLegacyKey(const char * legacyKey)2536 isWellFormedLegacyKey(const char* legacyKey)
2537 {
2538 const char* p = legacyKey;
2539 while (*p) {
2540 if (!UPRV_ISALPHANUM(*p)) {
2541 return FALSE;
2542 }
2543 p++;
2544 }
2545 return TRUE;
2546 }
2547
2548 static UBool
isWellFormedLegacyType(const char * legacyType)2549 isWellFormedLegacyType(const char* legacyType)
2550 {
2551 const char* p = legacyType;
2552 int32_t alphaNumLen = 0;
2553 while (*p) {
2554 if (*p == '_' || *p == '/' || *p == '-') {
2555 if (alphaNumLen == 0) {
2556 return FALSE;
2557 }
2558 alphaNumLen = 0;
2559 } else if (UPRV_ISALPHANUM(*p)) {
2560 alphaNumLen++;
2561 } else {
2562 return FALSE;
2563 }
2564 p++;
2565 }
2566 return (alphaNumLen != 0);
2567 }
2568
2569 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2570 uloc_toLegacyKey(const char* keyword)
2571 {
2572 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2573 if (legacyKey == NULL) {
2574 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2575 //
2576 // Note:
2577 // Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2578 // However, a key should not contain '=' obviously. For now, all existing
2579 // keys are using ASCII alphabetic letters only. We won't add any new key
2580 // that is not compatible with the BCP 47 syntax. Therefore, we assume
2581 // a valid key consist from [0-9a-zA-Z], no symbols.
2582 if (isWellFormedLegacyKey(keyword)) {
2583 return keyword;
2584 }
2585 }
2586 return legacyKey;
2587 }
2588
2589 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2590 uloc_toLegacyType(const char* keyword, const char* value)
2591 {
2592 const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2593 if (legacyType == NULL) {
2594 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2595 //
2596 // Note:
2597 // Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2598 // However, a type should not contain '=' obviously. For now, all existing
2599 // types are using ASCII alphabetic letters with a few symbol letters. We won't
2600 // add any new type that is not compatible with the BCP 47 syntax except timezone
2601 // IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
2602 // '-' '_' '/' in the middle.
2603 if (isWellFormedLegacyType(value)) {
2604 return value;
2605 }
2606 }
2607 return legacyType;
2608 }
2609
2610 /*eof*/
2611