1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
24
25 /*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31 */
32
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
36
37 #include "putilimp.h"
38 #include "ustr_imp.h"
39 #include "ulocimp.h"
40 #include "umutex.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43 #include "locmap.h"
44 #include "uarrsort.h"
45 #include "uenumimp.h"
46 #include "uassert.h"
47 #include "charstr.h"
48
49 #include <stdio.h> /* for sprintf */
50
51 U_NAMESPACE_USE
52
53 /* ### Declarations **************************************************/
54
/* Locale stuff from locid.cpp */
/* The three functions below are only declared here; their definitions are
   not in this part of the file. */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/* Takes the same parameters as the static _getKeywords() in this file,
   minus the addKeyword/addValue pair.
   NOTE(review): presumably a wrapper around _getKeywords() - confirm
   against the definition. */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
            char prev,
            char *keywords, int32_t keywordCapacity,
            char *values, int32_t valuesCapacity, int32_t *valLen,
            UBool valuesToo,
            UErrorCode *status);
65
66 /* ### Data tables **************************************************/
67
68 /**
69 * Table of language codes, both 2- and 3-letter, with preference
70 * given to 2-letter codes where possible. Includes 3-letter codes
71 * that lack a 2-letter equivalent.
72 *
73 * This list must be in sorted order. This list is returned directly
74 * to the user by some API.
75 *
76 * This list must be kept in sync with LANGUAGES_3, with corresponding
77 * entries matched.
78 *
79 * This table should be terminated with a NULL entry, followed by a
80 * second list, and another NULL entry. The first list is visible to
81 * user code when this array is returned by API. The second list
82 * contains codes we support, but do not expose through user API.
83 *
84 * Notes
85 *
86 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87 * include the revisions up to 2001/7/27 *CWB*
88 *
89 * The 3 character codes are the terminology codes like RFC 3066. This
90 * is compatible with prior ICU codes
91 *
92 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
93 * table but now at the end of the table because 3 character codes are
94 * duplicates. This avoids bad searches going from 3 to 2 character
95 * codes.
96 *
97 * The range qaa-qtz is reserved for local use
98 */
99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
100 /* ISO639 table version is 20150505 */
101 /* Subsequent hand addition of selected languages */
static const char * const LANGUAGES[] = {
    /* First list: entries here must line up index-for-index with LANGUAGES_3. */
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "cu", "cv", "cy",
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "ky",
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni", "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    "oc", "oj", "om", "or", "os", "osa", "ota",
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    "qu", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "swc", "syc", "syr", "szl",
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    "xal", "xh", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
    /* NULL ends the first list; a second list follows. */
NULL,
    "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
    /* NULL ends the second list (and the table). */
NULL
};
191
/* Deprecated language codes. Each list ends with two NULL entries.
   NOTE(review): entries appear to pair up by index with
   REPLACEMENT_LANGUAGES ("in"/"id", "iw"/"he", "ji"/"yi", "jw"/"jv") -
   confirm against the lookup code, which is not in this part of the file. */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
/* Replacement codes, listed in the same order as DEPRECATED_LANGUAGES. */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
198
199 /**
200 * Table of 3-letter language codes.
201 *
202 * This is a lookup table used to convert 3-letter language codes to
203 * their 2-letter equivalent, where possible. It must be kept in sync
204 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
205 * same language as LANGUAGES_3[i]. The commented-out lines are
206 * copied from LANGUAGES to make eyeballing this baby easier.
207 *
208 * Where a 3-letter language code has no 2-letter equivalent, the
209 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
210 *
211 * This table should be terminated with a NULL entry, followed by a
212 * second list, and another NULL entry. The two lists correspond to
213 * the two lists in LANGUAGES.
214 */
215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
216 /* ISO639 table version is 20150505 */
217 /* Subsequent hand addition of selected languages */
static const char * const LANGUAGES_3[] = {
    /* First list: entry i is the 3-letter code for LANGUAGES[i]. */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
    /* NULL ends the first list; the second list below corresponds to the
       obsolete codes in the second list of LANGUAGES. */
NULL,
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
    /* NULL ends the second list (and the table). */
NULL
};
308
309 /**
310 * Table of 2-letter country codes.
311 *
312 * This list must be in sorted order. This list is returned directly
313 * to the user by some API.
314 *
315 * This list must be kept in sync with COUNTRIES_3, with corresponding
316 * entries matched.
317 *
318 * This table should be terminated with a NULL entry, followed by a
319 * second list, and another NULL entry. The first list is visible to
320 * user code when this array is returned by API. The second list
321 * contains codes we support, but do not expose through user API.
322 *
323 * Notes:
324 *
325 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
326 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
327 * new codes keeping the old ones for compatibility updated to include
328 * 1999/12/03 revisions *CWB*
329 *
330 * RO(ROM) is now RO(ROU) according to
331 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
332 */
static const char * const COUNTRIES[] = {
    /* First list: entries here must line up index-for-index with COUNTRIES_3. */
    "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK",
    "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "YE", "YT", "ZA", "ZM", "ZW",
    /* NULL ends the first list; a second list follows. */
NULL,
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
    /* NULL ends the second list (and the table). */
NULL
};
368
/* Deprecated country codes. Each list ends with two NULL entries.
   REPLACEMENT_COUNTRIES lists one replacement per deprecated code, in the
   same order (see the commented-out copy of this list inside it). */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
/* Replacement for each entry of DEPRECATED_COUNTRIES, position by position. */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
376
377 /**
378 * Table of 3-letter country codes.
379 *
380 * This is a lookup table used to convert 3-letter country codes to
381 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
382 * For all valid i, COUNTRIES[i] must refer to the same country as
383 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
384 * to make eyeballing this baby easier.
385 *
386 * This table should be terminated with a NULL entry, followed by a
387 * second list, and another NULL entry. The two lists correspond to
388 * the two lists in COUNTRIES.
389 */
static const char * const COUNTRIES_3[] = {
/*  Each commented-out row repeats the COUNTRIES entries that the row
    below it corresponds to.                                             */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    /* NULL ends the first list; the second list below corresponds to the
       obsolete codes in the second list of COUNTRIES. */
NULL,
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    /* NULL ends the second list (and the table). */
NULL
};
456
/* One row of CANONICALIZE_MAP: an input locale ID, the ID it maps to, and an
   optional keyword/value pair associated with the mapping. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword is NULL */
} CanonicalizationMap;
463
464 /**
465 * A map to canonicalize locale IDs. This handles a variety of
466 * different semantic kinds of transformations.
467 */
/* Rows are { id, canonicalID, keyword, value }; keyword and value are both
   NULL when the mapping carries no keyword. */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
    { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
    { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
};
516
/* One row of VARIANT_MAP: a variant name and the keyword/value pair
   associated with it. */
typedef struct VariantMap {
    const char *variant;          /* input ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword is NULL */
} VariantMap;
522
/* Rows are { variant, keyword, value }. */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
528
529 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has a BCP47 extension and does not have '@':
 * true when id is non-NULL, contains no '@', and
 * getShortestSubtagLength(id) is exactly 1.
 *
 * The macro operates on its own argument. It previously passed the
 * identifier `localeID` to getShortestSubtagLength(), which only worked
 * when the caller happened to have a variable of that name holding the
 * same string.
 *
 * Note: `id` is evaluated more than once; pass an expression without side
 * effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *
 * finalID is set to buffer when uloc_forLanguageTag() returns a positive
 * length without failure or truncation; otherwise finalID is set to id.
 * A U_STRING_NOT_TERMINATED_WARNING is promoted to U_BUFFER_OVERFLOW_ERROR.
 *
 * This expands to a bare if/else statement (not wrapped in do/while), so
 * use it only as a full statement inside a braced block.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) \
        if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 ||  \
                U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
            finalID=(id); \
            if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { *(err) = U_BUFFER_OVERFLOW_ERROR; } \
        } else { \
            finalID=(buffer); \
        }
541 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)542 static int32_t getShortestSubtagLength(const char *localeID) {
543 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
544 int32_t length = localeIDLength;
545 int32_t tmpLength = 0;
546 int32_t i;
547 UBool reset = TRUE;
548
549 for (i = 0; i < localeIDLength; i++) {
550 if (localeID[i] != '_' && localeID[i] != '-') {
551 if (reset) {
552 tmpLength = 0;
553 reset = FALSE;
554 }
555 tmpLength++;
556 } else {
557 if (tmpLength != 0 && tmpLength < length) {
558 length = tmpLength;
559 }
560 reset = TRUE;
561 }
562 }
563
564 return length;
565 }
566
567 /* ### Keywords **************************************************/
/* True for the ASCII digits '0'..'9'. Evaluates c twice. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True for an ASCII letter (per uprv_isASCIILetter) or a digit.
   May evaluate c more than once. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of KeywordStruct.keyword, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keywordList array that _getKeywords() fills. */
#define ULOC_MAX_NO_KEYWORDS 25
575
576 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)577 locale_getKeywordsStart(const char *localeID) {
578 const char *result = NULL;
579 if((result = uprv_strchr(localeID, '@')) != NULL) {
580 return result;
581 }
582 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
583 else {
584 /* We do this because the @ sign is variant, and the @ sign used on one
585 EBCDIC machine won't be compiled the same way on other EBCDIC based
586 machines. */
587 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
588 const uint8_t *charToFind = ebcdicSigns;
589 while(*charToFind) {
590 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
591 return result;
592 }
593 charToFind++;
594 }
595 }
596 #endif
597 return NULL;
598 }
599
600 /**
601 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
602 * @param keywordName incoming name to be canonicalized
603 * @param status return status (keyword too long)
604 * @return length of the keyword name
605 */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)606 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
607 {
608 int32_t keywordNameLen = 0;
609
610 for (; *keywordName != 0; keywordName++) {
611 if (!UPRV_ISALPHANUM(*keywordName)) {
612 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
613 return 0;
614 }
615 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
616 buf[keywordNameLen++] = uprv_tolower(*keywordName);
617 } else {
618 /* keyword name too long for internal buffer */
619 *status = U_INTERNAL_PROGRAM_ERROR;
620 return 0;
621 }
622 }
623 if (keywordNameLen == 0) {
624 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
625 return 0;
626 }
627 buf[keywordNameLen] = 0; /* terminate */
628
629 return keywordNameLen;
630 }
631
/* One keyword/value pair collected by _getKeywords(). The value is not
   copied: valueStart points into the caller's string and valueLen gives
   its extent. */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* NUL-terminated keyword name */
    int32_t keywordLen;                    /* length of keyword, not counting the NUL */
    const char *valueStart;                /* first character of the value (not owned) */
    int32_t valueLen;                      /* number of characters in the value */
} KeywordStruct;
638
639 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)640 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
641 const char* leftString = ((const KeywordStruct *)left)->keyword;
642 const char* rightString = ((const KeywordStruct *)right)->keyword;
643 return uprv_strcmp(leftString, rightString);
644 }
645
646 /**
647 * Both addKeyword and addValue must already be in canonical form.
648 * Either both addKeyword and addValue are NULL, or neither is NULL.
649 * If they are not NULL they must be zero terminated.
 * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
651 */
652 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)653 _getKeywords(const char *localeID,
654 char prev,
655 char *keywords, int32_t keywordCapacity,
656 char *values, int32_t valuesCapacity, int32_t *valLen,
657 UBool valuesToo,
658 const char* addKeyword,
659 const char* addValue,
660 UErrorCode *status)
661 {
662 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
663
664 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
665 int32_t numKeywords = 0;
666 const char* pos = localeID;
667 const char* equalSign = NULL;
668 const char* semicolon = NULL;
669 int32_t i = 0, j, n;
670 int32_t keywordsLen = 0;
671 int32_t valuesLen = 0;
672
673 if(prev == '@') { /* start of keyword definition */
674 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
675 do {
676 UBool duplicate = FALSE;
677 /* skip leading spaces */
678 while(*pos == ' ') {
679 pos++;
680 }
681 if (!*pos) { /* handle trailing "; " */
682 break;
683 }
684 if(numKeywords == maxKeywords) {
685 *status = U_INTERNAL_PROGRAM_ERROR;
686 return 0;
687 }
688 equalSign = uprv_strchr(pos, '=');
689 semicolon = uprv_strchr(pos, ';');
690 /* lack of '=' [foo@currency] is illegal */
691 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
692 if(!equalSign || (semicolon && semicolon<equalSign)) {
693 *status = U_INVALID_FORMAT_ERROR;
694 return 0;
695 }
696 /* need to normalize both keyword and keyword name */
697 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
698 /* keyword name too long for internal buffer */
699 *status = U_INTERNAL_PROGRAM_ERROR;
700 return 0;
701 }
702 for(i = 0, n = 0; i < equalSign - pos; ++i) {
703 if (pos[i] != ' ') {
704 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
705 }
706 }
707
708 /* zero-length keyword is an error. */
709 if (n == 0) {
710 *status = U_INVALID_FORMAT_ERROR;
711 return 0;
712 }
713
714 keywordList[numKeywords].keyword[n] = 0;
715 keywordList[numKeywords].keywordLen = n;
716 /* now grab the value part. First we skip the '=' */
717 equalSign++;
718 /* then we leading spaces */
719 while(*equalSign == ' ') {
720 equalSign++;
721 }
722
723 /* Premature end or zero-length value */
724 if (!*equalSign || equalSign == semicolon) {
725 *status = U_INVALID_FORMAT_ERROR;
726 return 0;
727 }
728
729 keywordList[numKeywords].valueStart = equalSign;
730
731 pos = semicolon;
732 i = 0;
733 if(pos) {
734 while(*(pos - i - 1) == ' ') {
735 i++;
736 }
737 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
738 pos++;
739 } else {
740 i = (int32_t)uprv_strlen(equalSign);
741 while(i && equalSign[i-1] == ' ') {
742 i--;
743 }
744 keywordList[numKeywords].valueLen = i;
745 }
746 /* If this is a duplicate keyword, then ignore it */
747 for (j=0; j<numKeywords; ++j) {
748 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
749 duplicate = TRUE;
750 break;
751 }
752 }
753 if (!duplicate) {
754 ++numKeywords;
755 }
756 } while(pos);
757
758 /* Handle addKeyword/addValue. */
759 if (addKeyword != NULL) {
760 UBool duplicate = FALSE;
761 U_ASSERT(addValue != NULL);
762 /* Search for duplicate; if found, do nothing. Explicit keyword
763 overrides addKeyword. */
764 for (j=0; j<numKeywords; ++j) {
765 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
766 duplicate = TRUE;
767 break;
768 }
769 }
770 if (!duplicate) {
771 if (numKeywords == maxKeywords) {
772 *status = U_INTERNAL_PROGRAM_ERROR;
773 return 0;
774 }
775 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
776 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
777 keywordList[numKeywords].valueStart = addValue;
778 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
779 ++numKeywords;
780 }
781 } else {
782 U_ASSERT(addValue == NULL);
783 }
784
785 /* now we have a list of keywords */
786 /* we need to sort it */
787 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
788
789 /* Now construct the keyword part */
790 for(i = 0; i < numKeywords; i++) {
791 if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
792 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
793 if(valuesToo) {
794 keywords[keywordsLen + keywordList[i].keywordLen] = '=';
795 } else {
796 keywords[keywordsLen + keywordList[i].keywordLen] = 0;
797 }
798 }
799 keywordsLen += keywordList[i].keywordLen + 1;
800 if(valuesToo) {
801 if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
802 uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
803 }
804 keywordsLen += keywordList[i].valueLen;
805
806 if(i < numKeywords - 1) {
807 if(keywordsLen < keywordCapacity) {
808 keywords[keywordsLen] = ';';
809 }
810 keywordsLen++;
811 }
812 }
813 if(values) {
814 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
815 uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
816 values[valuesLen + keywordList[i].valueLen] = 0;
817 }
818 valuesLen += keywordList[i].valueLen + 1;
819 }
820 }
821 if(values) {
822 values[valuesLen] = 0;
823 if(valLen) {
824 *valLen = valuesLen;
825 }
826 }
827 return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
828 } else {
829 return 0;
830 }
831 }
832
833 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)834 locale_getKeywords(const char *localeID,
835 char prev,
836 char *keywords, int32_t keywordCapacity,
837 char *values, int32_t valuesCapacity, int32_t *valLen,
838 UBool valuesToo,
839 UErrorCode *status) {
840 return _getKeywords(localeID, prev, keywords, keywordCapacity,
841 values, valuesCapacity, valLen, valuesToo,
842 NULL, NULL, status);
843 }
844
845 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)846 uloc_getKeywordValue(const char* localeID,
847 const char* keywordName,
848 char* buffer, int32_t bufferCapacity,
849 UErrorCode* status)
850 {
851 const char* startSearchHere = NULL;
852 const char* nextSeparator = NULL;
853 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
854 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
855 int32_t result = 0;
856
857 if(status && U_SUCCESS(*status) && localeID) {
858 char tempBuffer[ULOC_FULLNAME_CAPACITY];
859 const char* tmpLocaleID;
860
861 if (keywordName == NULL || keywordName[0] == 0) {
862 *status = U_ILLEGAL_ARGUMENT_ERROR;
863 return 0;
864 }
865
866 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
867 if(U_FAILURE(*status)) {
868 return 0;
869 }
870
871 if (_hasBCP47Extension(localeID)) {
872 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
873 } else {
874 tmpLocaleID=localeID;
875 }
876
877 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
878 if(startSearchHere == NULL) {
879 /* no keywords, return at once */
880 return 0;
881 }
882
883 /* find the first keyword */
884 while(startSearchHere) {
885 const char* keyValueTail;
886 int32_t keyValueLen;
887
888 startSearchHere++; /* skip @ or ; */
889 nextSeparator = uprv_strchr(startSearchHere, '=');
890 if(!nextSeparator) {
891 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
892 return 0;
893 }
894 /* strip leading & trailing spaces (TC decided to tolerate these) */
895 while(*startSearchHere == ' ') {
896 startSearchHere++;
897 }
898 keyValueTail = nextSeparator;
899 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
900 keyValueTail--;
901 }
902 /* now keyValueTail points to first char after the keyName */
903 /* copy & normalize keyName from locale */
904 if (startSearchHere == keyValueTail) {
905 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
906 return 0;
907 }
908 keyValueLen = 0;
909 while (startSearchHere < keyValueTail) {
910 if (!UPRV_ISALPHANUM(*startSearchHere)) {
911 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
912 return 0;
913 }
914 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
915 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
916 } else {
917 /* keyword name too long for internal buffer */
918 *status = U_INTERNAL_PROGRAM_ERROR;
919 return 0;
920 }
921 }
922 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
923
924 startSearchHere = uprv_strchr(nextSeparator, ';');
925
926 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
927 /* current entry matches the keyword. */
928 nextSeparator++; /* skip '=' */
929 /* First strip leading & trailing spaces (TC decided to tolerate these) */
930 while(*nextSeparator == ' ') {
931 nextSeparator++;
932 }
933 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
934 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
935 keyValueTail--;
936 }
937 /* Now copy the value, but check well-formedness */
938 if (nextSeparator == keyValueTail) {
939 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
940 return 0;
941 }
942 keyValueLen = 0;
943 while (nextSeparator < keyValueTail) {
944 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
945 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
946 return 0;
947 }
948 if (keyValueLen < bufferCapacity) {
949 /* Should we lowercase value to return here? Tests expect as-is. */
950 buffer[keyValueLen++] = *nextSeparator++;
951 } else { /* keep advancing so we return correct length in case of overflow */
952 keyValueLen++;
953 nextSeparator++;
954 }
955 }
956 result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
957 return result;
958 }
959 }
960 }
961 return 0;
962 }
963
/**
 * Adds, replaces or removes one keyword in the locale ID held in 'buffer', rewriting the
 * keyword section in place.
 *
 * The keyword name is canonicalized with locale_canonKeywordName(); the value is validated
 * character by character. A NULL or empty keywordValue means "remove the keyword". The
 * existing entries are re-emitted (names lowercased, surrounding spaces dropped) into a
 * temporary CharString, with the new entry inserted before the first existing entry whose
 * name compares greater, or appended when none does. The buffer is only modified once the
 * complete new keyword section is known to fit.
 *
 * @param keywordName    keyword to set; NULL or empty sets U_ILLEGAL_ARGUMENT_ERROR
 * @param keywordValue   new value, or NULL/empty to remove the keyword
 * @param buffer         NUL-terminated locale ID, updated in place (uprv_strlen is applied
 *                       to it unconditionally, so it must not be NULL)
 * @param bufferCapacity capacity of 'buffer'; values <= 1 set U_ILLEGAL_ARGUMENT_ERROR
 * @param status         in/out error code; dereferenced unconditionally
 * @return -1 if *status was already a failure on entry; 0 when an argument or the existing
 *         locale is rejected; the original length when nothing is changed (removal of an
 *         absent keyword, or a CharString append failure); otherwise the length of the
 *         updated locale ID, which is also returned together with U_BUFFER_OVERFLOW_ERROR
 *         (buffer left unchanged) when it does not fit.
 */
U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    /* TODO: sorting. removal. */
    int32_t keywordNameLen;
    int32_t keywordValueLen;
    int32_t bufLen;
    int32_t needLen = 0;
    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];       /* canonicalized input keyword name */
    char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];     /* validated copy of the input value */
    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN]; /* normalized name of the current entry */
    int32_t rc;
    char* nextSeparator = NULL;
    char* nextEqualsign = NULL;
    char* startSearchHere = NULL;
    char* keywordStart = NULL;
    CharString updatedKeysAndValues;   /* rebuilt keyword section, including the leading '@' */
    int32_t updatedKeysAndValuesLen;
    UBool handledInputKeyAndValue = FALSE;
    char keyValuePrefix = '@';         /* '@' before the first emitted entry, ';' afterwards */

    if(U_FAILURE(*status)) {
        return -1;
    }
    if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    bufLen = (int32_t)uprv_strlen(buffer);
    if(bufferCapacity<bufLen) {
        /* The capacity is less than the length?! Is this NULL terminated? */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* validate the value and copy it into keywordValueBuffer */
    keywordValueLen = 0;
    if(keywordValue) {
        while (*keywordValue != 0) {
            if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
                return 0;
            }
            if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
                /* Should we force lowercase in value to set? */
                keywordValueBuffer[keywordValueLen++] = *keywordValue++;
            } else {
                /* keywordValue too long for internal buffer */
                *status = U_INTERNAL_PROGRAM_ERROR;
                return 0;
            }
        }
    }
    keywordValueBuffer[keywordValueLen] = 0; /* terminate */

    startSearchHere = (char*)locale_getKeywordsStart(buffer);
    /* shortcut: the locale has no keyword section, or only a bare trailing '@' */
    if(startSearchHere == NULL || (startSearchHere[1]==0)) {
        if(keywordValueLen == 0) { /* no keywords = nothing to remove */
            return bufLen;
        }

        /* existing text + '@' + name + '=' + value */
        needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
        if(startSearchHere) { /* had a single @ */
            needLen--; /* already had the @ */
            /* startSearchHere points at the @ */
        } else {
            startSearchHere=buffer+bufLen;
        }
        if(needLen >= bufferCapacity) {
            *status = U_BUFFER_OVERFLOW_ERROR;
            return needLen; /* no change */
        }
        *startSearchHere++ = '@';
        uprv_strcpy(startSearchHere, keywordNameBuffer);
        startSearchHere += keywordNameLen;
        *startSearchHere++ = '=';
        uprv_strcpy(startSearchHere, keywordValueBuffer);
        return needLen;
    } /* end shortcut - no @ */

    /* startSearchHere keeps pointing at the '@'; keywordStart walks the entries */
    keywordStart = startSearchHere;
    /* search for keyword */
    while(keywordStart) {
        const char* keyValueTail;
        int32_t keyValueLen;

        keywordStart++; /* skip @ or ; */
        nextEqualsign = uprv_strchr(keywordStart, '=');
        if (!nextEqualsign) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return 0;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while(*keywordStart == ' ') {
            keywordStart++;
        }
        keyValueTail = nextEqualsign;
        while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        /* now keyValueTail points to first char after the keyName */
        /* copy & normalize keyName from locale */
        if (keywordStart == keyValueTail) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return 0;
        }
        keyValueLen = 0;
        while (keywordStart < keyValueTail) {
            if (!UPRV_ISALPHANUM(*keywordStart)) {
                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return 0;
            }
            if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
                localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
            } else {
                /* keyword name too long for internal buffer */
                *status = U_INTERNAL_PROGRAM_ERROR;
                return 0;
            }
        }
        localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */

        /* ';' ending this entry, or NULL when this is the last entry */
        nextSeparator = uprv_strchr(nextEqualsign, ';');

        /* start processing the value part */
        nextEqualsign++; /* skip '=' */
        /* First strip leading & trailing spaces (TC decided to tolerate these) */
        while(*nextEqualsign == ' ') {
            nextEqualsign++;
        }
        keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
        while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        if (nextEqualsign == keyValueTail) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
            return 0;
        }

        /* rc == 0: same keyword; rc < 0: input keyword orders before the current entry */
        rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
        if(rc == 0) {
            /* Current entry matches the input keyword. Update the entry */
            if(keywordValueLen > 0) { /* updating a value */
                updatedKeysAndValues.append(keyValuePrefix, *status);
                keyValuePrefix = ';'; /* for any subsequent key-value pair */
                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
                updatedKeysAndValues.append('=', *status);
                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
            } /* else removing this entry, don't emit anything */
            handledInputKeyAndValue = TRUE;
        } else {
            /* input keyword sorts earlier than current entry, add before current entry */
            if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
                /* insert new entry at this location */
                updatedKeysAndValues.append(keyValuePrefix, *status);
                keyValuePrefix = ';'; /* for any subsequent key-value pair */
                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
                updatedKeysAndValues.append('=', *status);
                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
                handledInputKeyAndValue = TRUE;
            }
            /* copy the current entry */
            updatedKeysAndValues.append(keyValuePrefix, *status);
            keyValuePrefix = ';'; /* for any subsequent key-value pair */
            updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
            updatedKeysAndValues.append('=', *status);
            updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
        }
        if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
            /* append new entry at the end, it sorts later than existing entries */
            updatedKeysAndValues.append(keyValuePrefix, *status);
            /* skip keyValuePrefix update, no subsequent key-value pair */
            updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
            updatedKeysAndValues.append('=', *status);
            updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
            handledInputKeyAndValue = TRUE;
        }
        keywordStart = nextSeparator;
    } /* end loop searching */

    /* Any error from updatedKeysAndValues.append above would be internal and not due to
     * problems with the passed-in locale. So if we did encounter problems with the
     * passed-in locale above, those errors took precedence and overrode any error
     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
     * are errors here they are from updatedKeysAndValues.append; they do cause an
     * error return but the passed-in locale is unmodified and the original bufLen is
     * returned.
     */
    if (!handledInputKeyAndValue || U_FAILURE(*status)) {
        /* if input key/value specified removal of a keyword not present in locale, or
         * there was an error in CharString.append, leave original locale alone. */
        return bufLen;
    }

    updatedKeysAndValuesLen = updatedKeysAndValues.length();
    /* needLen = length of the part before '@' + length of updated key-value part including '@' */
    needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
    if(needLen >= bufferCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return needLen; /* no change */
    }
    /* overwrite the old keyword section starting at the '@'; the terminator is written below */
    if (updatedKeysAndValuesLen > 0) {
        uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
    }
    buffer[needLen]=0;
    return needLen;
}
1177
1178 /* ### ID parsing implementation **************************************************/
1179
/* TRUE if 'a' is one of the letters that may start a special prefix: x, X, i or I.
   The argument is parenthesized so that expression arguments are evaluated as a unit;
   note that it is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* TRUE if one of the special prefixes is here (s=string):
   'x-' or 'i-', i.e. a prefix letter followed by an ID separator */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* TRUE if 'a' ends a locale ID field: end of string, '.' or '@'.
 * Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1190
_strnchr(const char * str,int32_t len,char c)1191 static char* _strnchr(const char* str, int32_t len, char c) {
1192 U_ASSERT(str != 0 && len >= 0);
1193 while (len-- != 0) {
1194 char d = *str;
1195 if (d == c) {
1196 return (char*) str;
1197 } else if (d == 0) {
1198 break;
1199 }
1200 ++str;
1201 }
1202 return NULL;
1203 }
1204
1205 /**
1206 * Lookup 'key' in the array 'list'. The array 'list' should contain
1207 * a NULL entry, followed by more entries, and a second NULL entry.
1208 *
1209 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1210 * COUNTRIES_3.
1211 */
_findIndex(const char * const * list,const char * key)1212 static int16_t _findIndex(const char* const* list, const char* key)
1213 {
1214 const char* const* anchor = list;
1215 int32_t pass = 0;
1216
1217 /* Make two passes through two NULL-terminated arrays at 'list' */
1218 while (pass++ < 2) {
1219 while (*list) {
1220 if (uprv_strcmp(key, *list) == 0) {
1221 return (int16_t)(list - anchor);
1222 }
1223 list++;
1224 }
1225 ++list; /* skip final NULL *CWB*/
1226 }
1227 return -1;
1228 }
1229
1230 /* count the length of src while copying it to dest; return strlen(src) */
1231 static inline int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1232 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1233 const char *anchor;
1234 char c;
1235
1236 anchor=src;
1237 for(;;) {
1238 if((c=*src)==0) {
1239 return (int32_t)(src-anchor);
1240 }
1241 if(destCapacity<=0) {
1242 return (int32_t)((src-anchor)+uprv_strlen(src));
1243 }
1244 ++src;
1245 *dest++=c;
1246 --destCapacity;
1247 }
1248 }
1249
1250 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1251 uloc_getCurrentCountryID(const char* oldID){
1252 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1253 if (offset >= 0) {
1254 return REPLACEMENT_COUNTRIES[offset];
1255 }
1256 return oldID;
1257 }
1258 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1259 uloc_getCurrentLanguageID(const char* oldID){
1260 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1261 if (offset >= 0) {
1262 return REPLACEMENT_LANGUAGES[offset];
1263 }
1264 return oldID;
1265 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript() and
 * ulocimp_getCountry() each parse one leading piece of a locale ID.
 * To avoid duplicating the code that handles the earlier pieces
 * in the functions for the later ones, each of them
 * sets the *pEnd pointer to where it stopped parsing.
 *
 * TODO try to use this in Locale
 */
1274 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1275 ulocimp_getLanguage(const char *localeID,
1276 char *language, int32_t languageCapacity,
1277 const char **pEnd) {
1278 int32_t i=0;
1279 int32_t offset;
1280 char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1281
1282 /* if it starts with i- or x- then copy that prefix */
1283 if(_isIDPrefix(localeID)) {
1284 if(i<languageCapacity) {
1285 language[i]=(char)uprv_tolower(*localeID);
1286 }
1287 if(i<languageCapacity) {
1288 language[i+1]='-';
1289 }
1290 i+=2;
1291 localeID+=2;
1292 }
1293
1294 /* copy the language as far as possible and count its length */
1295 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1296 if(i<languageCapacity) {
1297 language[i]=(char)uprv_tolower(*localeID);
1298 }
1299 if(i<3) {
1300 U_ASSERT(i>=0);
1301 lang[i]=(char)uprv_tolower(*localeID);
1302 }
1303 i++;
1304 localeID++;
1305 }
1306
1307 if(i==3) {
1308 /* convert 3 character code to 2 character code if possible *CWB*/
1309 offset=_findIndex(LANGUAGES_3, lang);
1310 if(offset>=0) {
1311 i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1312 }
1313 }
1314
1315 if(pEnd!=NULL) {
1316 *pEnd=localeID;
1317 }
1318 return i;
1319 }
1320
1321 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1322 ulocimp_getScript(const char *localeID,
1323 char *script, int32_t scriptCapacity,
1324 const char **pEnd)
1325 {
1326 int32_t idLen = 0;
1327
1328 if (pEnd != NULL) {
1329 *pEnd = localeID;
1330 }
1331
1332 /* copy the second item as far as possible and count its length */
1333 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1334 && uprv_isASCIILetter(localeID[idLen])) {
1335 idLen++;
1336 }
1337
1338 /* If it's exactly 4 characters long, then it's a script and not a country. */
1339 if (idLen == 4) {
1340 int32_t i;
1341 if (pEnd != NULL) {
1342 *pEnd = localeID+idLen;
1343 }
1344 if(idLen > scriptCapacity) {
1345 idLen = scriptCapacity;
1346 }
1347 if (idLen >= 1) {
1348 script[0]=(char)uprv_toupper(*(localeID++));
1349 }
1350 for (i = 1; i < idLen; i++) {
1351 script[i]=(char)uprv_tolower(*(localeID++));
1352 }
1353 }
1354 else {
1355 idLen = 0;
1356 }
1357 return idLen;
1358 }
1359
1360 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1361 ulocimp_getCountry(const char *localeID,
1362 char *country, int32_t countryCapacity,
1363 const char **pEnd)
1364 {
1365 int32_t idLen=0;
1366 char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1367 int32_t offset;
1368
1369 /* copy the country as far as possible and count its length */
1370 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1371 if(idLen<(ULOC_COUNTRY_CAPACITY-1)) { /*CWB*/
1372 cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1373 }
1374 idLen++;
1375 }
1376
1377 /* the country should be either length 2 or 3 */
1378 if (idLen == 2 || idLen == 3) {
1379 UBool gotCountry = FALSE;
1380 /* convert 3 character code to 2 character code if possible *CWB*/
1381 if(idLen==3) {
1382 offset=_findIndex(COUNTRIES_3, cnty);
1383 if(offset>=0) {
1384 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1385 gotCountry = TRUE;
1386 }
1387 }
1388 if (!gotCountry) {
1389 int32_t i = 0;
1390 for (i = 0; i < idLen; i++) {
1391 if (i < countryCapacity) {
1392 country[i]=(char)uprv_toupper(localeID[i]);
1393 }
1394 }
1395 }
1396 localeID+=idLen;
1397 } else {
1398 idLen = 0;
1399 }
1400
1401 if(pEnd!=NULL) {
1402 *pEnd=localeID;
1403 }
1404
1405 return idLen;
1406 }
1407
1408 /**
1409 * @param needSeparator if true, then add leading '_' if any variants
1410 * are added to 'variant'
1411 */
1412 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1413 _getVariantEx(const char *localeID,
1414 char prev,
1415 char *variant, int32_t variantCapacity,
1416 UBool needSeparator) {
1417 int32_t i=0;
1418
1419 /* get one or more variant tags and separate them with '_' */
1420 if(_isIDSeparator(prev)) {
1421 /* get a variant string after a '-' or '_' */
1422 while(!_isTerminator(*localeID)) {
1423 if (needSeparator) {
1424 if (i<variantCapacity) {
1425 variant[i] = '_';
1426 }
1427 ++i;
1428 needSeparator = FALSE;
1429 }
1430 if(i<variantCapacity) {
1431 variant[i]=(char)uprv_toupper(*localeID);
1432 if(variant[i]=='-') {
1433 variant[i]='_';
1434 }
1435 }
1436 i++;
1437 localeID++;
1438 }
1439 }
1440
1441 /* if there is no variant tag after a '-' or '_' then look for '@' */
1442 if(i==0) {
1443 if(prev=='@') {
1444 /* keep localeID */
1445 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1446 ++localeID; /* point after the '@' */
1447 } else {
1448 return 0;
1449 }
1450 while(!_isTerminator(*localeID)) {
1451 if (needSeparator) {
1452 if (i<variantCapacity) {
1453 variant[i] = '_';
1454 }
1455 ++i;
1456 needSeparator = FALSE;
1457 }
1458 if(i<variantCapacity) {
1459 variant[i]=(char)uprv_toupper(*localeID);
1460 if(variant[i]=='-' || variant[i]==',') {
1461 variant[i]='_';
1462 }
1463 }
1464 i++;
1465 localeID++;
1466 }
1467 }
1468
1469 return i;
1470 }
1471
1472 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1473 _getVariant(const char *localeID,
1474 char prev,
1475 char *variant, int32_t variantCapacity) {
1476 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1477 }
1478
1479 /**
1480 * Delete ALL instances of a variant from the given list of one or
1481 * more variants. Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1482 * @param variants the source string of one or more variants,
1483 * separated by '_'. This will be MODIFIED IN PLACE. Not zero
1484 * terminated; if it is, trailing zero will NOT be maintained.
1485 * @param variantsLen length of variants
1486 * @param toDelete variant to delete, without separators, e.g. "EURO"
1487 * or "PREEURO"; not zero terminated
1488 * @param toDeleteLen length of toDelete
1489 * @return number of characters deleted from variants
1490 */
1491 static int32_t
_deleteVariant(char * variants,int32_t variantsLen,const char * toDelete,int32_t toDeleteLen)1492 _deleteVariant(char* variants, int32_t variantsLen,
1493 const char* toDelete, int32_t toDeleteLen)
1494 {
1495 int32_t delta = 0; /* number of chars deleted */
1496 for (;;) {
1497 UBool flag = FALSE;
1498 if (variantsLen < toDeleteLen) {
1499 return delta;
1500 }
1501 if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1502 (variantsLen == toDeleteLen ||
1503 (flag=(variants[toDeleteLen] == '_')) != 0))
1504 {
1505 int32_t d = toDeleteLen + (flag?1:0);
1506 variantsLen -= d;
1507 delta += d;
1508 if (variantsLen > 0) {
1509 uprv_memmove(variants, variants+d, variantsLen);
1510 }
1511 } else {
1512 char* p = _strnchr(variants, variantsLen, '_');
1513 if (p == NULL) {
1514 return delta;
1515 }
1516 ++p;
1517 variantsLen -= (int32_t)(p - variants);
1518 variants = p;
1519 }
1520 }
1521 }
1522
1523 /* Keyword enumeration */
1524
/* Iteration state stored in the 'context' member of a keyword UEnumeration. */
typedef struct UKeywordsContext {
    /* Heap copy of the keyword list made by uloc_openKeywordList(): keywords stored back
       to back, each NUL-terminated, with one extra NUL after the last one. Released in
       uloc_kw_closeKeywords(). */
    char* keywords;
    /* Next keyword to hand out; points into 'keywords'. */
    char* current;
} UKeywordsContext;
1529
1530 U_CDECL_BEGIN
1531
1532 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1533 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1534 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1535 uprv_free(enumerator->context);
1536 uprv_free(enumerator);
1537 }
1538
1539 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1540 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1541 char *kw = ((UKeywordsContext *)en->context)->keywords;
1542 int32_t result = 0;
1543 while(*kw) {
1544 result++;
1545 kw += uprv_strlen(kw)+1;
1546 }
1547 return result;
1548 }
1549
1550 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1551 uloc_kw_nextKeyword(UEnumeration* en,
1552 int32_t* resultLength,
1553 UErrorCode* /*status*/) {
1554 const char* result = ((UKeywordsContext *)en->context)->current;
1555 int32_t len = 0;
1556 if(*result) {
1557 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1558 ((UKeywordsContext *)en->context)->current += len+1;
1559 } else {
1560 result = NULL;
1561 }
1562 if (resultLength) {
1563 *resultLength = len;
1564 }
1565 return result;
1566 }
1567
1568 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1569 uloc_kw_resetKeywords(UEnumeration* en,
1570 UErrorCode* /*status*/) {
1571 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1572 }
1573
1574 U_CDECL_END
1575
1576
/* Template for keyword enumerations: uloc_openKeywordList() copies it into every
   UEnumeration it allocates and then fills in 'context'.
   NOTE(review): the member order comes from the UEnumeration declaration, which is not
   visible here; the two leading NULLs are presumably the data members, one of them
   being 'context' - confirm against uenumimp.h. */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,   /* frees buffer, context and enumeration */
    uloc_kw_countKeywords,   /* counts the keywords in the buffer */
    uenum_unextDefault,
    uloc_kw_nextKeyword,     /* returns the keyword at the cursor and advances */
    uloc_kw_resetKeywords    /* rewinds the cursor */
};
1586
1587 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1588 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1589 {
1590 UKeywordsContext *myContext = NULL;
1591 UEnumeration *result = NULL;
1592
1593 if(U_FAILURE(*status)) {
1594 return NULL;
1595 }
1596 result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1597 /* Null pointer test */
1598 if (result == NULL) {
1599 *status = U_MEMORY_ALLOCATION_ERROR;
1600 return NULL;
1601 }
1602 uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1603 myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1604 if (myContext == NULL) {
1605 *status = U_MEMORY_ALLOCATION_ERROR;
1606 uprv_free(result);
1607 return NULL;
1608 }
1609 myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1610 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1611 myContext->keywords[keywordListSize] = 0;
1612 myContext->current = myContext->keywords;
1613 result->context = myContext;
1614 return result;
1615 }
1616
1617 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1618 uloc_openKeywords(const char* localeID,
1619 UErrorCode* status)
1620 {
1621 int32_t i=0;
1622 char keywords[256];
1623 int32_t keywordsCapacity = 256;
1624 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1625 const char* tmpLocaleID;
1626
1627 if(status==NULL || U_FAILURE(*status)) {
1628 return 0;
1629 }
1630
1631 if (_hasBCP47Extension(localeID)) {
1632 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1633 } else {
1634 if (localeID==NULL) {
1635 localeID=uloc_getDefault();
1636 }
1637 tmpLocaleID=localeID;
1638 }
1639
1640 /* Skip the language */
1641 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1642 if(_isIDSeparator(*tmpLocaleID)) {
1643 const char *scriptID;
1644 /* Skip the script if available */
1645 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1646 if(scriptID != tmpLocaleID+1) {
1647 /* Found optional script */
1648 tmpLocaleID = scriptID;
1649 }
1650 /* Skip the Country */
1651 if (_isIDSeparator(*tmpLocaleID)) {
1652 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1653 if(_isIDSeparator(*tmpLocaleID)) {
1654 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1655 }
1656 }
1657 }
1658
1659 /* keywords are located after '@' */
1660 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1661 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1662 }
1663
1664 if(i) {
1665 return uloc_openKeywordList(keywords, i, status);
1666 } else {
1667 return NULL;
1668 }
1669 }
1670
1671
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of 'mask' is set in 'options'. Both arguments are parenthesized so
   that expressions such as (a | b) keep their meaning when passed in. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)
1677
/* The special locale ID "i-default". The array deliberately has no terminating NUL,
   so it must only be used together with I_DEFAULT_LENGTH (e.g. in length-bounded
   comparisons). */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* number of characters in i_default */
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1680
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 * When _ULOC_STRIP_KEYWORDS is set, the "@keyword=value" section is not
 * appended to the output.
 *
 * This is the code underlying uloc_getName and uloc_canonicalize.
 *
 * @param localeID        ID to process; NULL selects uloc_getDefault().
 * @param result          Caller buffer, may be NULL. When it is NULL or smaller
 *                        than ULOC_FULLNAME_CAPACITY the name is assembled in a
 *                        local buffer and copied to result (truncated) at the end.
 * @param resultCapacity  Capacity of result in chars.
 * @param options         Bit set of _ULOC_CANONICALIZE / _ULOC_STRIP_KEYWORDS.
 * @param err             In/out error code; dereferenced unconditionally, so it
 *                        must not be NULL. Nothing is done if it already
 *                        indicates a failure.
 * @return 0 if *err is a failure on entry; otherwise the value returned by
 *         u_terminateChars(result, resultCapacity, len, err), where len is the
 *         computed length of the assembled name (it keeps counting even when
 *         characters no longer fit).
 */
static int32_t
_canonicalize(const char* localeID,
              char* result,
              int32_t resultCapacity,
              uint32_t options,
              UErrorCode* err) {
    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
    char localeBuffer[ULOC_FULLNAME_CAPACITY];
    char tempBuffer[ULOC_FULLNAME_CAPACITY];
    const char* origLocaleID;
    const char* tmpLocaleID;             /* read cursor into the input ID */
    const char* keywordAssign = NULL;    /* first '=' at/after the keyword start, if any */
    const char* separatorIndicator = NULL; /* first ';' at/after the keyword start, if any */
    const char* addKeyword = NULL;       /* keyword injected by a mapping table entry */
    const char* addValue = NULL;         /* value paired with addKeyword */
    char* name;                          /* buffer being written: result or localeBuffer */
    char* variant = NULL; /* pointer into name, or NULL */

    if (U_FAILURE(*err)) {
        return 0;
    }

    /* _hasBCP47Extension/_ConvertBCP47 are defined outside this section;
       presumably they detect a BCP 47 style ID and set tmpLocaleID to a
       converted copy in tempBuffer -- confirm against their definitions. */
    if (_hasBCP47Extension(localeID)) {
        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
    } else {
        if (localeID==NULL) {
           localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    /* remember the start: the "i-default" test below compares against it */
    origLocaleID=tmpLocaleID;

    /* if we are doing a full canonicalization, then put results in
       localeBuffer, if necessary; otherwise send them to result. */
    /* Note: the options test is commented out, so the local buffer is chosen
       purely on result/resultCapacity, for both canonicalization levels. */
    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
        (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
        name = localeBuffer;
        nameCapacity = (int32_t)sizeof(localeBuffer);
    } else {
        name = result;
        nameCapacity = resultCapacity;
    }

    /* get all pieces, one after another, and separate with '_' */
    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);

    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
        /* the input starts with "i-default": substitute the default locale ID */
        const char *d = uloc_getDefault();

        len = (int32_t)uprv_strlen(d);

        if (name != NULL) {
            /* NOTE(review): copies len chars without comparing len to
               nameCapacity -- confirm the default ID always fits. */
            uprv_strncpy(name, d, len);
        }
    } else if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;

        ++fieldCount;
        /* writes are guarded by len<nameCapacity; len itself always advances */
        if(len<nameCapacity) {
            name[len]='_';
        }
        ++len;

        scriptSize=ulocimp_getScript(tmpLocaleID+1,
            (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
        if(scriptSize > 0) {
            /* Found optional script */
            tmpLocaleID = scriptID;
            ++fieldCount;
            len+=scriptSize;
            if (_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _ */
                if(len<nameCapacity) {
                    name[len]='_';
                }
                ++len;
            }
        }

        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;
            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
                (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
            if (cntrySize > 0) {
                /* Found optional country */
                tmpLocaleID = cntryID;
                len+=cntrySize;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _  if we found country before. */
                if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
                    ++fieldCount;
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                }

                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
                    (len<nameCapacity ? name+len : NULL), nameCapacity-len);
                if (variantSize > 0) {
                    /* remember where the variant starts for the mapping step below */
                    variant = len<nameCapacity ? name+len : NULL;
                    len += variantSize;
                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
                }
            }
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    /* (level 1 only; copies from '.' up to, but not including, '@' or the end) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
        UBool done = FALSE;
        do {
            char c = *tmpLocaleID;
            switch (c) {
            case 0:
            case '@':
                done = TRUE;
                break;
            default:
                if (len<nameCapacity) {
                    name[len] = c;
                }
                ++len;
                ++tmpLocaleID;
                break;
            }
        } while (!done);
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either points to '@' or is NULL */
    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
        keywordAssign = uprv_strchr(tmpLocaleID, '=');
        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    /* (level 1 only: the rest of the ID, '@' included, is copied verbatim) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        tmpLocaleID != NULL && keywordAssign == NULL) {
        for (;;) {
            char c = *tmpLocaleID;
            if (c == 0) {
                break;
            }
            if (len<nameCapacity) {
                name[len] = c;
            }
            ++len;
            ++tmpLocaleID;
        }
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
            int32_t posixVariantSize;
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
                do {
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                    ++fieldCount;
                } while(fieldCount<2);
            }
            /* NOTE(review): unlike the calls above, name+len is passed without
               a len<nameCapacity guard -- confirm _getVariantEx tolerates a
               non-positive capacity. */
            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
                                             (UBool)(variantSize > 0));
            if (posixVariantSize > 0) {
                if (variant == NULL) {
                    variant = name+len;
                }
                len += posixVariantSize;
                variantSize += posixVariantSize;
            }
        }

        /* Handle generic variants first */
        /* The first VARIANT_MAP entry that _deleteVariant removes from the
           variant contributes a keyword/value pair to append later. */
        if (variant) {
            for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
                const char* variantToCompare = VARIANT_MAP[j].variant;
                int32_t n = (int32_t)uprv_strlen(variantToCompare);
                int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
                len -= variantLen;
                if (variantLen > 0) {
                    if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
                        --len;
                    }
                    addKeyword = VARIANT_MAP[j].keyword;
                    addValue = VARIANT_MAP[j].value;
                    break;
                }
            }
            if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
                --len;
            }
        }

        /* Look up the ID in the canonicalization map */
        /* An exact match on the name built so far replaces it wholesale. */
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
            const char* id = CANONICALIZE_MAP[j].id;
            int32_t n = (int32_t)uprv_strlen(id);
            if (len == n && uprv_strncmp(name, id, n) == 0) {
                if (n == 0 && tmpLocaleID != NULL) {
                    break; /* Don't remap "" if keywords present */
                }
                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
                if (CANONICALIZE_MAP[j].keyword) {
                    addKeyword = CANONICALIZE_MAP[j].keyword;
                    addValue = CANONICALIZE_MAP[j].value;
                }
                break;
            }
        }
    }

    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        /* Real keywords exist when '=' was found and no ';' precedes it. */
        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
            (!separatorIndicator || separatorIndicator > keywordAssign)) {
            if(len<nameCapacity) {
                name[len]='@';
            }
            ++len;
            ++fieldCount;
            len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
                                NULL, 0, NULL, TRUE, addKeyword, addValue, err);
        } else if (addKeyword != NULL) {
            /* no keywords in the input, but a mapping asked for one: emit "@key=value" */
            U_ASSERT(addValue != NULL && len < nameCapacity);
            /* inelegant but works -- later make _getKeywords do this? */
            len += _copyCount(name+len, nameCapacity-len, "@");
            len += _copyCount(name+len, nameCapacity-len, addKeyword);
            len += _copyCount(name+len, nameCapacity-len, "=");
            len += _copyCount(name+len, nameCapacity-len, addValue);
        }
    }

    /* If the name was assembled locally, hand over as much as fits in result. */
    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
    }

    return u_terminateChars(result, resultCapacity, len, err);
}
1932
1933 /* ### ID parsing API **************************************************/
1934
1935 U_CAPI int32_t U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1936 uloc_getParent(const char* localeID,
1937 char* parent,
1938 int32_t parentCapacity,
1939 UErrorCode* err)
1940 {
1941 const char *lastUnderscore;
1942 int32_t i;
1943
1944 if (U_FAILURE(*err))
1945 return 0;
1946
1947 if (localeID == NULL)
1948 localeID = uloc_getDefault();
1949
1950 lastUnderscore=uprv_strrchr(localeID, '_');
1951 if(lastUnderscore!=NULL) {
1952 i=(int32_t)(lastUnderscore-localeID);
1953 } else {
1954 i=0;
1955 }
1956
1957 if(i>0 && parent != localeID) {
1958 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1959 }
1960 return u_terminateChars(parent, parentCapacity, i, err);
1961 }
1962
1963 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1964 uloc_getLanguage(const char* localeID,
1965 char* language,
1966 int32_t languageCapacity,
1967 UErrorCode* err)
1968 {
1969 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1970 int32_t i=0;
1971
1972 if (err==NULL || U_FAILURE(*err)) {
1973 return 0;
1974 }
1975
1976 if(localeID==NULL) {
1977 localeID=uloc_getDefault();
1978 }
1979
1980 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1981 return u_terminateChars(language, languageCapacity, i, err);
1982 }
1983
1984 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1985 uloc_getScript(const char* localeID,
1986 char* script,
1987 int32_t scriptCapacity,
1988 UErrorCode* err)
1989 {
1990 int32_t i=0;
1991
1992 if(err==NULL || U_FAILURE(*err)) {
1993 return 0;
1994 }
1995
1996 if(localeID==NULL) {
1997 localeID=uloc_getDefault();
1998 }
1999
2000 /* skip the language */
2001 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
2002 if(_isIDSeparator(*localeID)) {
2003 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
2004 }
2005 return u_terminateChars(script, scriptCapacity, i, err);
2006 }
2007
2008 U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)2009 uloc_getCountry(const char* localeID,
2010 char* country,
2011 int32_t countryCapacity,
2012 UErrorCode* err)
2013 {
2014 int32_t i=0;
2015
2016 if(err==NULL || U_FAILURE(*err)) {
2017 return 0;
2018 }
2019
2020 if(localeID==NULL) {
2021 localeID=uloc_getDefault();
2022 }
2023
2024 /* Skip the language */
2025 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
2026 if(_isIDSeparator(*localeID)) {
2027 const char *scriptID;
2028 /* Skip the script if available */
2029 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
2030 if(scriptID != localeID+1) {
2031 /* Found optional script */
2032 localeID = scriptID;
2033 }
2034 if(_isIDSeparator(*localeID)) {
2035 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
2036 }
2037 }
2038 return u_terminateChars(country, countryCapacity, i, err);
2039 }
2040
2041 U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)2042 uloc_getVariant(const char* localeID,
2043 char* variant,
2044 int32_t variantCapacity,
2045 UErrorCode* err)
2046 {
2047 char tempBuffer[ULOC_FULLNAME_CAPACITY];
2048 const char* tmpLocaleID;
2049 int32_t i=0;
2050
2051 if(err==NULL || U_FAILURE(*err)) {
2052 return 0;
2053 }
2054
2055 if (_hasBCP47Extension(localeID)) {
2056 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2057 } else {
2058 if (localeID==NULL) {
2059 localeID=uloc_getDefault();
2060 }
2061 tmpLocaleID=localeID;
2062 }
2063
2064 /* Skip the language */
2065 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2066 if(_isIDSeparator(*tmpLocaleID)) {
2067 const char *scriptID;
2068 /* Skip the script if available */
2069 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2070 if(scriptID != tmpLocaleID+1) {
2071 /* Found optional script */
2072 tmpLocaleID = scriptID;
2073 }
2074 /* Skip the Country */
2075 if (_isIDSeparator(*tmpLocaleID)) {
2076 const char *cntryID;
2077 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2078 if (cntryID != tmpLocaleID+1) {
2079 /* Found optional country */
2080 tmpLocaleID = cntryID;
2081 }
2082 if(_isIDSeparator(*tmpLocaleID)) {
2083 /* If there was no country ID, skip a possible extra IDSeparator */
2084 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2085 tmpLocaleID++;
2086 }
2087 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2088 }
2089 }
2090 }
2091
2092 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2093 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2094 /*
2095 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2096 i=_getVariant(localeID+1, '@', variant, variantCapacity);
2097 }
2098 */
2099 return u_terminateChars(variant, variantCapacity, i, err);
2100 }
2101
2102 U_CAPI int32_t U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2103 uloc_getName(const char* localeID,
2104 char* name,
2105 int32_t nameCapacity,
2106 UErrorCode* err)
2107 {
2108 return _canonicalize(localeID, name, nameCapacity, 0, err);
2109 }
2110
2111 U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2112 uloc_getBaseName(const char* localeID,
2113 char* name,
2114 int32_t nameCapacity,
2115 UErrorCode* err)
2116 {
2117 return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2118 }
2119
2120 U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2121 uloc_canonicalize(const char* localeID,
2122 char* name,
2123 int32_t nameCapacity,
2124 UErrorCode* err)
2125 {
2126 return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2127 }
2128
2129 U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char * localeID)2130 uloc_getISO3Language(const char* localeID)
2131 {
2132 int16_t offset;
2133 char lang[ULOC_LANG_CAPACITY];
2134 UErrorCode err = U_ZERO_ERROR;
2135
2136 if (localeID == NULL)
2137 {
2138 localeID = uloc_getDefault();
2139 }
2140 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2141 if (U_FAILURE(err))
2142 return "";
2143 offset = _findIndex(LANGUAGES, lang);
2144 if (offset < 0)
2145 return "";
2146 return LANGUAGES_3[offset];
2147 }
2148
2149 U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char * localeID)2150 uloc_getISO3Country(const char* localeID)
2151 {
2152 int16_t offset;
2153 char cntry[ULOC_LANG_CAPACITY];
2154 UErrorCode err = U_ZERO_ERROR;
2155
2156 if (localeID == NULL)
2157 {
2158 localeID = uloc_getDefault();
2159 }
2160 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2161 if (U_FAILURE(err))
2162 return "";
2163 offset = _findIndex(COUNTRIES, cntry);
2164 if (offset < 0)
2165 return "";
2166
2167 return COUNTRIES_3[offset];
2168 }
2169
/**
 * Maps a locale ID to a host LCID.
 *
 * A platform lookup (uprv_convertToLCIDPlatform) is tried first and its
 * non-zero result is returned directly. Otherwise the ID is converted with
 * uprv_convertToLCID(); when the ID contains '@', a reduced ID that keeps
 * only the "collation" keyword is tried first, falling back to the original
 * ID if that reduced ID cannot be built.
 *
 * @param localeID  ID to convert.
 * @return the LCID, or 0 when localeID is NULL, shorter than two characters,
 *         or its language cannot be extracted.
 */
U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char* localeID)
{
    UErrorCode status = U_ZERO_ERROR;
    char       langID[ULOC_FULLNAME_CAPACITY];
    uint32_t   lcid = 0;

    /* Check for incomplete id. */
    if (!localeID || uprv_strlen(localeID) < 2) {
        return 0;
    }

    // Attempt platform lookup if available
    lcid = uprv_convertToLCIDPlatform(localeID);
    if (lcid > 0)
    {
        // Windows found an LCID, return that
        return lcid;
    }

    uloc_getLanguage(localeID, langID, sizeof(langID), &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    if (uprv_strchr(localeID, '@')) {
        // uprv_convertToLCID does not support keywords other than collation.
        // Remove all keywords except collation.
        int32_t len;
        char collVal[ULOC_KEYWORDS_CAPACITY];
        char tmpLocaleID[ULOC_FULLNAME_CAPACITY];

        // One slot is held back in each capacity below so the explicit
        // [len] = 0 terminators always stay inside the buffers.
        len = uloc_getKeywordValue(localeID, "collation", collVal,
            UPRV_LENGTHOF(collVal) - 1, &status);

        if (U_SUCCESS(status) && len > 0) {
            collVal[len] = 0;

            // Base name = the ID with its keyword section stripped.
            len = uloc_getBaseName(localeID, tmpLocaleID,
                UPRV_LENGTHOF(tmpLocaleID) - 1, &status);

            if (U_SUCCESS(status) && len > 0) {
                tmpLocaleID[len] = 0;

                // Re-attach only the collation keyword; the capacity passed is
                // the space left after the base name.
                len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
                    UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);

                if (U_SUCCESS(status) && len > 0) {
                    tmpLocaleID[len] = 0;
                    return uprv_convertToLCID(langID, tmpLocaleID, &status);
                }
            }
        }

        // fall through - all keywords are simply ignored
        // (any error from the attempt above is discarded first)
        status = U_ZERO_ERROR;
    }

    return uprv_convertToLCID(langID, localeID, &status);
}
2230
2231 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2232 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2233 UErrorCode *status)
2234 {
2235 return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2236 }
2237
2238 /* ### Default locale **************************************************/
2239
2240 U_CAPI const char* U_EXPORT2
uloc_getDefault()2241 uloc_getDefault()
2242 {
2243 return locale_get_default();
2244 }
2245
2246 U_CAPI void U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2247 uloc_setDefault(const char* newDefaultLocale,
2248 UErrorCode* err)
2249 {
2250 if (U_FAILURE(*err))
2251 return;
2252 /* the error code isn't currently used for anything by this function*/
2253
2254 /* propagate change to C++ */
2255 locale_set_default(newDefaultLocale);
2256 }
2257
2258 /**
2259 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2260 * to an array of pointers to arrays of char. All of these pointers are owned
2261 * by ICU-- do not delete them, and do not write through them. The array is
2262 * terminated with a null pointer.
2263 */
2264 U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()2265 uloc_getISOLanguages()
2266 {
2267 return LANGUAGES;
2268 }
2269
2270 /**
2271 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2272 * pointer to an array of pointers to arrays of char. All of these pointers are
2273 * owned by ICU-- do not delete them, and do not write through them. The array is
2274 * terminated with a null pointer.
2275 */
2276 U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()2277 uloc_getISOCountries()
2278 {
2279 return COUNTRIES;
2280 }
2281
2282
2283 /* this function to be moved into cstring.c later */
2284 static char gDecimal = 0;
2285
2286 static /* U_CAPI */
2287 double
2288 /* U_EXPORT2 */
_uloc_strtod(const char * start,char ** end)2289 _uloc_strtod(const char *start, char **end) {
2290 char *decimal;
2291 char *myEnd;
2292 char buf[30];
2293 double rv;
2294 if (!gDecimal) {
2295 char rep[5];
2296 /* For machines that decide to change the decimal on you,
2297 and try to be too smart with localization.
2298 This normally should be just a '.'. */
2299 sprintf(rep, "%+1.1f", 1.0);
2300 gDecimal = rep[2];
2301 }
2302
2303 if(gDecimal == '.') {
2304 return uprv_strtod(start, end); /* fall through to OS */
2305 } else {
2306 uprv_strncpy(buf, start, 29);
2307 buf[29]=0;
2308 decimal = uprv_strchr(buf, '.');
2309 if(decimal) {
2310 *decimal = gDecimal;
2311 } else {
2312 return uprv_strtod(start, end); /* no decimal point */
2313 }
2314 rv = uprv_strtod(buf, &myEnd);
2315 if(end) {
2316 *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2317 }
2318 return rv;
2319 }
2320 }
2321
/* One parsed entry of an HTTP Accept-Language list, filled in by
   uloc_acceptLanguageFromHTTP and ordered by uloc_acceptLanguageCompare. */
typedef struct {
    float q;       /* quality value; set to 1.0 when the entry has no ';' parameter */
    int32_t dummy; /* to avoid uninitialized memory copy from qsort */
    char locale[ULOC_FULLNAME_CAPACITY+1]; /* NUL-terminated locale ID of the entry */
} _acceptLangItem;
2327
2328 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void *,const void * a,const void * b)2329 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2330 {
2331 const _acceptLangItem *aa = (const _acceptLangItem*)a;
2332 const _acceptLangItem *bb = (const _acceptLangItem*)b;
2333
2334 int32_t rc = 0;
2335 if(bb->q < aa->q) {
2336 rc = -1; /* A > B */
2337 } else if(bb->q > aa->q) {
2338 rc = 1; /* A < B */
2339 } else {
2340 rc = 0; /* A = B */
2341 }
2342
2343 if(rc==0) {
2344 rc = uprv_stricmp(aa->locale, bb->locale);
2345 }
2346
2347 #if defined(ULOC_DEBUG)
2348 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2349 aa->locale, aa->q,
2350 bb->locale, bb->q,
2351 rc);*/
2352 #endif
2353
2354 return rc;
2355 }
2356
2357 /*
2358 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2359 */
2360
2361 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2362 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2363 const char *httpAcceptLanguage,
2364 UEnumeration* availableLocales,
2365 UErrorCode *status)
2366 {
2367 MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2368 char tmp[ULOC_FULLNAME_CAPACITY +1];
2369 int32_t n = 0;
2370 const char *itemEnd;
2371 const char *paramEnd;
2372 const char *s;
2373 const char *t;
2374 int32_t res;
2375 int32_t i;
2376 int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2377
2378 if(U_FAILURE(*status)) {
2379 return -1;
2380 }
2381
2382 for(s=httpAcceptLanguage;s&&*s;) {
2383 while(isspace(*s)) /* eat space at the beginning */
2384 s++;
2385 itemEnd=uprv_strchr(s,',');
2386 paramEnd=uprv_strchr(s,';');
2387 if(!itemEnd) {
2388 itemEnd = httpAcceptLanguage+l; /* end of string */
2389 }
2390 if(paramEnd && paramEnd<itemEnd) {
2391 /* semicolon (;) is closer than end (,) */
2392 t = paramEnd+1;
2393 if(*t=='q') {
2394 t++;
2395 }
2396 while(isspace(*t)) {
2397 t++;
2398 }
2399 if(*t=='=') {
2400 t++;
2401 }
2402 while(isspace(*t)) {
2403 t++;
2404 }
2405 items[n].q = (float)_uloc_strtod(t,NULL);
2406 } else {
2407 /* no semicolon - it's 1.0 */
2408 items[n].q = 1.0f;
2409 paramEnd = itemEnd;
2410 }
2411 items[n].dummy=0;
2412 /* eat spaces prior to semi */
2413 for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2414 ;
2415 int32_t slen = static_cast<int32_t>(((t+1)-s));
2416 if(slen > ULOC_FULLNAME_CAPACITY) {
2417 *status = U_BUFFER_OVERFLOW_ERROR;
2418 return -1; // too big
2419 }
2420 uprv_strncpy(items[n].locale, s, slen);
2421 items[n].locale[slen]=0; // terminate
2422 int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2423 if(U_FAILURE(*status)) return -1;
2424 if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2425 // canonicalization had an effect- copy back
2426 uprv_strncpy(items[n].locale, tmp, clen);
2427 items[n].locale[clen] = 0; // terminate
2428 }
2429 #if defined(ULOC_DEBUG)
2430 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2431 #endif
2432 n++;
2433 s = itemEnd;
2434 while(*s==',') { /* eat duplicate commas */
2435 s++;
2436 }
2437 if(n>=items.getCapacity()) { // If we need more items
2438 if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2439 *status = U_MEMORY_ALLOCATION_ERROR;
2440 return -1;
2441 }
2442 #if defined(ULOC_DEBUG)
2443 fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2444 #endif
2445 }
2446 }
2447 uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2448 if (U_FAILURE(*status)) {
2449 return -1;
2450 }
2451 LocalMemory<const char*> strs(NULL);
2452 if (strs.allocateInsteadAndReset(n) == NULL) {
2453 *status = U_MEMORY_ALLOCATION_ERROR;
2454 return -1;
2455 }
2456 for(i=0;i<n;i++) {
2457 #if defined(ULOC_DEBUG)
2458 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2459 #endif
2460 strs[i]=items[i].locale;
2461 }
2462 res = uloc_acceptLanguage(result, resultAvailable, outResult,
2463 strs.getAlias(), n, availableLocales, status);
2464 return res;
2465 }
2466
2467
2468 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2469 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2470 UAcceptResult *outResult, const char **acceptList,
2471 int32_t acceptListCount,
2472 UEnumeration* availableLocales,
2473 UErrorCode *status)
2474 {
2475 int32_t i,j;
2476 int32_t len;
2477 int32_t maxLen=0;
2478 char tmp[ULOC_FULLNAME_CAPACITY+1];
2479 const char *l;
2480 char **fallbackList;
2481 if(U_FAILURE(*status)) {
2482 return -1;
2483 }
2484 fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2485 if(fallbackList==NULL) {
2486 *status = U_MEMORY_ALLOCATION_ERROR;
2487 return -1;
2488 }
2489 for(i=0;i<acceptListCount;i++) {
2490 #if defined(ULOC_DEBUG)
2491 fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2492 #endif
2493 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2494 #if defined(ULOC_DEBUG)
2495 fprintf(stderr," %s\n", l);
2496 #endif
2497 len = (int32_t)uprv_strlen(l);
2498 if(!uprv_strcmp(acceptList[i], l)) {
2499 if(outResult) {
2500 *outResult = ULOC_ACCEPT_VALID;
2501 }
2502 #if defined(ULOC_DEBUG)
2503 fprintf(stderr, "MATCH! %s\n", l);
2504 #endif
2505 if(len>0) {
2506 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2507 }
2508 for(j=0;j<i;j++) {
2509 uprv_free(fallbackList[j]);
2510 }
2511 uprv_free(fallbackList);
2512 return u_terminateChars(result, resultAvailable, len, status);
2513 }
2514 if(len>maxLen) {
2515 maxLen = len;
2516 }
2517 }
2518 uenum_reset(availableLocales, status);
2519 /* save off parent info */
2520 if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2521 fallbackList[i] = uprv_strdup(tmp);
2522 } else {
2523 fallbackList[i]=0;
2524 }
2525 }
2526
2527 for(maxLen--;maxLen>0;maxLen--) {
2528 for(i=0;i<acceptListCount;i++) {
2529 if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2530 #if defined(ULOC_DEBUG)
2531 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2532 #endif
2533 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2534 #if defined(ULOC_DEBUG)
2535 fprintf(stderr," %s\n", l);
2536 #endif
2537 len = (int32_t)uprv_strlen(l);
2538 if(!uprv_strcmp(fallbackList[i], l)) {
2539 if(outResult) {
2540 *outResult = ULOC_ACCEPT_FALLBACK;
2541 }
2542 #if defined(ULOC_DEBUG)
2543 fprintf(stderr, "fallback MATCH! %s\n", l);
2544 #endif
2545 if(len>0) {
2546 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2547 }
2548 for(j=0;j<acceptListCount;j++) {
2549 uprv_free(fallbackList[j]);
2550 }
2551 uprv_free(fallbackList);
2552 return u_terminateChars(result, resultAvailable, len, status);
2553 }
2554 }
2555 uenum_reset(availableLocales, status);
2556
2557 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2558 uprv_free(fallbackList[i]);
2559 fallbackList[i] = uprv_strdup(tmp);
2560 } else {
2561 uprv_free(fallbackList[i]);
2562 fallbackList[i]=0;
2563 }
2564 }
2565 }
2566 if(outResult) {
2567 *outResult = ULOC_ACCEPT_FAILED;
2568 }
2569 }
2570 for(i=0;i<acceptListCount;i++) {
2571 uprv_free(fallbackList[i]);
2572 }
2573 uprv_free(fallbackList);
2574 return -1;
2575 }
2576
2577 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2578 uloc_toUnicodeLocaleKey(const char* keyword)
2579 {
2580 const char* bcpKey = ulocimp_toBcpKey(keyword);
2581 if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2582 // unknown keyword, but syntax is fine..
2583 return keyword;
2584 }
2585 return bcpKey;
2586 }
2587
2588 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2589 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2590 {
2591 const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2592 if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2593 // unknown keyword, but syntax is fine..
2594 return value;
2595 }
2596 return bcpType;
2597 }
2598
2599 static UBool
isWellFormedLegacyKey(const char * legacyKey)2600 isWellFormedLegacyKey(const char* legacyKey)
2601 {
2602 const char* p = legacyKey;
2603 while (*p) {
2604 if (!UPRV_ISALPHANUM(*p)) {
2605 return FALSE;
2606 }
2607 p++;
2608 }
2609 return TRUE;
2610 }
2611
2612 static UBool
isWellFormedLegacyType(const char * legacyType)2613 isWellFormedLegacyType(const char* legacyType)
2614 {
2615 const char* p = legacyType;
2616 int32_t alphaNumLen = 0;
2617 while (*p) {
2618 if (*p == '_' || *p == '/' || *p == '-') {
2619 if (alphaNumLen == 0) {
2620 return FALSE;
2621 }
2622 alphaNumLen = 0;
2623 } else if (UPRV_ISALPHANUM(*p)) {
2624 alphaNumLen++;
2625 } else {
2626 return FALSE;
2627 }
2628 p++;
2629 }
2630 return (alphaNumLen != 0);
2631 }
2632
2633 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2634 uloc_toLegacyKey(const char* keyword)
2635 {
2636 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2637 if (legacyKey == NULL) {
2638 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2639 //
2640 // Note:
2641 // LDML/CLDR provides some definition of keyword syntax in
2642 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2643 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2644 // Keys can only consist of [0-9a-zA-Z].
2645 if (isWellFormedLegacyKey(keyword)) {
2646 return keyword;
2647 }
2648 }
2649 return legacyKey;
2650 }
2651
2652 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2653 uloc_toLegacyType(const char* keyword, const char* value)
2654 {
2655 const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2656 if (legacyType == NULL) {
2657 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2658 //
2659 // Note:
2660 // LDML/CLDR provides some definition of keyword syntax in
2661 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2662 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2663 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2664 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2665 if (isWellFormedLegacyType(value)) {
2666 return value;
2667 }
2668 }
2669 return legacyType;
2670 }
2671
2672 /*eof*/
2673