• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/bytestream.h"
34 #include "unicode/errorcode.h"
35 #include "unicode/stringpiece.h"
36 #include "unicode/utypes.h"
37 #include "unicode/ustring.h"
38 #include "unicode/uloc.h"
39 
40 #include "bytesinkutil.h"
41 #include "putilimp.h"
42 #include "ustr_imp.h"
43 #include "ulocimp.h"
44 #include "umutex.h"
45 #include "cstring.h"
46 #include "cmemory.h"
47 #include "locmap.h"
48 #include "uarrsort.h"
49 #include "uenumimp.h"
50 #include "uassert.h"
51 #include "charstr.h"
52 
53 #include <algorithm>
54 #include <stdio.h> /* for sprintf */
55 
56 U_NAMESPACE_USE
57 
58 /* ### Declarations **************************************************/
59 
60 /* Locale stuff from locid.cpp */
61 U_CFUNC void locale_set_default(const char *id);
62 U_CFUNC const char *locale_get_default(void);
63 U_CFUNC int32_t
64 locale_getKeywords(const char *localeID,
65             char prev,
66             char *keywords, int32_t keywordCapacity,
67             UBool valuesToo,
68             UErrorCode *status);
69 
70 /* ### Data tables **************************************************/
71 
/**
 * Table of language codes, both 2- and 3-letter, with preference
 * given to 2-letter codes where possible.  Includes 3-letter codes
 * that lack a 2-letter equivalent.
 *
 * This list must be in sorted order.  This list is returned directly
 * to the user by some API.
 *
 * This list must be kept in sync with LANGUAGES_3, with corresponding
 * entries matched.
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The first list is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * Notes
 *
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
 * include the revisions up to 2001/7/27 *CWB*
 *
 * The 3 character codes are the terminology codes like RFC 3066.  This
 * is compatible with prior ICU codes.
 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
 * table, now at the end of the table because their 3 character codes
 * are duplicates.  This avoids bad searches going from 3 to 2 character
 * codes.
 *
 * The range qaa-qtz is reserved for local use.
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "mo",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
NULL,   /* end of the list exposed through API */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL    /* end of the second (non-exposed) list */
};
196 
/*
 * Deprecated 2-letter language codes and their replacements.
 *
 * The two arrays have the same number of entries and the same double-NULL
 * termination.
 * NOTE(review): presumably DEPRECATED_LANGUAGES[i] is replaced by
 * REPLACEMENT_LANGUAGES[i] (parallel tables) -- the lookup code is outside
 * this view; confirm there before reordering either list.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
/*  "in", "iw", "ji", "jw" */
    "id", "he", "yi", "jv", NULL, NULL
};
203 
/**
 * Table of 3-letter language codes.
 *
 * This is a lookup table used to convert 3-letter language codes to
 * their 2-letter equivalent, where possible.  It must be kept in sync
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i].  The commented-out line above the
 * second list is copied from LANGUAGES to make visual comparison
 * easier.
 *
 * Where a 3-letter language code has no 2-letter equivalent, the
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The two lists correspond to
 * the two lists in LANGUAGES.
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL,
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL
};
314 
/**
 * Table of 2-letter country codes.
 *
 * This list must be in sorted order.  This list is returned directly
 * to the user by some API.
 *
 * This list must be kept in sync with COUNTRIES_3, with corresponding
 * entries matched.
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The first list is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * Notes:
 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 * new codes keeping the old ones for compatibility updated to include
 * 1999/12/03 revisions *CWB*
 *
 * RO(ROM) is now RO(ROU) according to
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL,   /* end of the list exposed through API */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
NULL    /* end of the second (non-exposed) list */
};
374 
/*
 * Deprecated 2-letter country codes and their replacements.
 *
 * The two arrays are parallel: the commented-out row in
 * REPLACEMENT_COUNTRIES repeats DEPRECATED_COUNTRIES so that each
 * replacement sits directly below the code it replaces.  Both arrays end
 * with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
382 
/**
 * Table of 3-letter country codes.
 *
 * This is a lookup table used to convert 3-letter country codes to
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 * For all valid i, COUNTRIES[i] must refer to the same country as
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 * to make visual comparison of the two tables easier.
 *
 * This table should be terminated with a NULL entry, followed by a
 * second list, and another NULL entry.  The two lists correspond to
 * the two lists in COUNTRIES.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL
};
462 
/** One entry of the locale ID canonicalization table. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;

/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each entry maps a complete input ID to the ID it should be
 * replaced with.  The array has no terminator entry, so its length must
 * be derived from its size.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art__LOJBAN",    "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",      "zh" }, /* registered name */
    { "zh__HAKKA",      "hak" }, /* registered name */
    { "zh__XIANG",      "hsn" }, /* registered name */
    /* Subtags with 3 chars won't be treated as variants, hence the */
    /* single underscore in the IDs below. */
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
485 
486 /* ### BCP47 Conversion *******************************************/
/*
 * Tests whether the locale id looks like a BCP47 tag with an extension:
 * it is non-NULL, contains no '@', and its shortest subtag (as computed by
 * getShortestSubtagLength) is exactly one character long.
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects.  The test is applied to the macro argument itself; it does not
 * depend on any particular variable name in the caller's scope.
 */
#define _hasBCP47Extension(id) ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to a Unicode locale id via uloc_forLanguageTag().
 *
 * On success finalID is set to buffer, which holds the converted id.
 * If the conversion returns a length <= 0, fails, or reports
 * U_STRING_NOT_TERMINATED_WARNING, finalID is set to the unmodified id;
 * in the not-terminated case *err is additionally promoted to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * Arguments are evaluated more than once and must be free of side effects.
 */
#define _ConvertBCP47(finalID, id, buffer, length, err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || \
            U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
        (finalID) = (id); \
        if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { *(err) = U_BUFFER_OVERFLOW_ERROR; } \
    } else { \
        (finalID) = (buffer); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Gets the length of the shortest subtag in the given localeID.
 *
 * Subtags are separated by '_' or '-'.  Only subtags that are followed by a
 * separator are measured; the trailing subtag (everything after the last
 * separator) is not compared.  Empty subtags produced by leading or
 * consecutive separators are ignored.
 *
 * @param localeID NUL-terminated locale ID; must not be NULL.
 * @return the length of the shortest separator-terminated subtag, or the
 *         length of the whole string when no such subtag exists.
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = -1;  /* -1: no separator-terminated subtag seen yet */
    int32_t current = 0;    /* length of the subtag being scanned */
    int32_t i = 0;

    for (; localeID[i] != 0; i++) {
        if (localeID[i] == '_' || localeID[i] == '-') {
            if (current != 0 && (shortest < 0 || current < shortest)) {
                shortest = current;
            }
            current = 0;
        } else {
            current++;
        }
    }

    /* i is now the total string length, used when nothing was measured. */
    return shortest < 0 ? i : shortest;
}
524 
525 /* ### Keywords **************************************************/
/* Character classification helpers.  Each macro may evaluate its argument */
/* more than once, so pass only side-effect-free expressions. */

/* True for the ASCII digits '0'..'9'. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True for ASCII letters (per uprv_isASCIILetter) and ASCII digits. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes, including the terminating NUL, of a keyword name buffer. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Maximum number of distinct keywords collected from one locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
533 
534 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)535 locale_getKeywordsStart(const char *localeID) {
536     const char *result = NULL;
537     if((result = uprv_strchr(localeID, '@')) != NULL) {
538         return result;
539     }
540 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541     else {
542         /* We do this because the @ sign is variant, and the @ sign used on one
543         EBCDIC machine won't be compiled the same way on other EBCDIC based
544         machines. */
545         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
546         const uint8_t *charToFind = ebcdicSigns;
547         while(*charToFind) {
548             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
549                 return result;
550             }
551             charToFind++;
552         }
553     }
554 #endif
555     return NULL;
556 }
557 
558 /**
559  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560  * @param keywordName incoming name to be canonicalized
561  * @param status return status (keyword too long)
562  * @return length of the keyword name
563  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)564 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
565 {
566   int32_t keywordNameLen = 0;
567 
568   for (; *keywordName != 0; keywordName++) {
569     if (!UPRV_ISALPHANUM(*keywordName)) {
570       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
571       return 0;
572     }
573     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
574       buf[keywordNameLen++] = uprv_tolower(*keywordName);
575     } else {
576       /* keyword name too long for internal buffer */
577       *status = U_INTERNAL_PROGRAM_ERROR;
578       return 0;
579     }
580   }
581   if (keywordNameLen == 0) {
582     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
583     return 0;
584   }
585   buf[keywordNameLen] = 0; /* terminate */
586 
587   return keywordNameLen;
588 }
589 
590 typedef struct {
591     char keyword[ULOC_KEYWORD_BUFFER_LEN];
592     int32_t keywordLen;
593     const char *valueStart;
594     int32_t valueLen;
595 } KeywordStruct;
596 
597 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)598 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
599     const char* leftString = ((const KeywordStruct *)left)->keyword;
600     const char* rightString = ((const KeywordStruct *)right)->keyword;
601     return uprv_strcmp(leftString, rightString);
602 }
603 
604 static void
_getKeywords(const char * localeID,char prev,ByteSink & sink,UBool valuesToo,UErrorCode * status)605 _getKeywords(const char *localeID,
606              char prev,
607              ByteSink& sink,
608              UBool valuesToo,
609              UErrorCode *status)
610 {
611     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612 
613     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614     int32_t numKeywords = 0;
615     const char* pos = localeID;
616     const char* equalSign = NULL;
617     const char* semicolon = NULL;
618     int32_t i = 0, j, n;
619 
620     if(prev == '@') { /* start of keyword definition */
621         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
622         do {
623             UBool duplicate = FALSE;
624             /* skip leading spaces */
625             while(*pos == ' ') {
626                 pos++;
627             }
628             if (!*pos) { /* handle trailing "; " */
629                 break;
630             }
631             if(numKeywords == maxKeywords) {
632                 *status = U_INTERNAL_PROGRAM_ERROR;
633                 return;
634             }
635             equalSign = uprv_strchr(pos, '=');
636             semicolon = uprv_strchr(pos, ';');
637             /* lack of '=' [foo@currency] is illegal */
638             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
639             if(!equalSign || (semicolon && semicolon<equalSign)) {
640                 *status = U_INVALID_FORMAT_ERROR;
641                 return;
642             }
643             /* need to normalize both keyword and keyword name */
644             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645                 /* keyword name too long for internal buffer */
646                 *status = U_INTERNAL_PROGRAM_ERROR;
647                 return;
648             }
649             for(i = 0, n = 0; i < equalSign - pos; ++i) {
650                 if (pos[i] != ' ') {
651                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652                 }
653             }
654 
655             /* zero-length keyword is an error. */
656             if (n == 0) {
657                 *status = U_INVALID_FORMAT_ERROR;
658                 return;
659             }
660 
661             keywordList[numKeywords].keyword[n] = 0;
662             keywordList[numKeywords].keywordLen = n;
663             /* now grab the value part. First we skip the '=' */
664             equalSign++;
665             /* then we leading spaces */
666             while(*equalSign == ' ') {
667                 equalSign++;
668             }
669 
670             /* Premature end or zero-length value */
671             if (!*equalSign || equalSign == semicolon) {
672                 *status = U_INVALID_FORMAT_ERROR;
673                 return;
674             }
675 
676             keywordList[numKeywords].valueStart = equalSign;
677 
678             pos = semicolon;
679             i = 0;
680             if(pos) {
681                 while(*(pos - i - 1) == ' ') {
682                     i++;
683                 }
684                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685                 pos++;
686             } else {
687                 i = (int32_t)uprv_strlen(equalSign);
688                 while(i && equalSign[i-1] == ' ') {
689                     i--;
690                 }
691                 keywordList[numKeywords].valueLen = i;
692             }
693             /* If this is a duplicate keyword, then ignore it */
694             for (j=0; j<numKeywords; ++j) {
695                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
696                     duplicate = TRUE;
697                     break;
698                 }
699             }
700             if (!duplicate) {
701                 ++numKeywords;
702             }
703         } while(pos);
704 
705         /* now we have a list of keywords */
706         /* we need to sort it */
707         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
708 
709         /* Now construct the keyword part */
710         for(i = 0; i < numKeywords; i++) {
711             sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712             if(valuesToo) {
713                 sink.Append("=", 1);
714                 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715                 if(i < numKeywords - 1) {
716                     sink.Append(";", 1);
717                 }
718             } else {
719                 sink.Append("\0", 1);
720             }
721         }
722     }
723 }
724 
725 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,UBool valuesToo,UErrorCode * status)726 locale_getKeywords(const char *localeID,
727                    char prev,
728                    char *keywords, int32_t keywordCapacity,
729                    UBool valuesToo,
730                    UErrorCode *status) {
731     if (U_FAILURE(*status)) {
732         return 0;
733     }
734 
735     CheckedArrayByteSink sink(keywords, keywordCapacity);
736     _getKeywords(localeID, prev, sink, valuesToo, status);
737 
738     int32_t reslen = sink.NumberOfBytesAppended();
739 
740     if (U_FAILURE(*status)) {
741         return reslen;
742     }
743 
744     if (sink.Overflowed()) {
745         *status = U_BUFFER_OVERFLOW_ERROR;
746     } else {
747         u_terminateChars(keywords, keywordCapacity, reslen, status);
748     }
749 
750     return reslen;
751 }
752 
753 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)754 uloc_getKeywordValue(const char* localeID,
755                      const char* keywordName,
756                      char* buffer, int32_t bufferCapacity,
757                      UErrorCode* status)
758 {
759     if (buffer != nullptr) {
760         buffer[0] = '\0';
761     }
762     const char* startSearchHere = NULL;
763     const char* nextSeparator = NULL;
764     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
765     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
766     int32_t result = 0;
767 
768     if(status && U_SUCCESS(*status) && localeID) {
769       char tempBuffer[ULOC_FULLNAME_CAPACITY];
770       const char* tmpLocaleID;
771 
772       if (keywordName == NULL || keywordName[0] == 0) {
773         *status = U_ILLEGAL_ARGUMENT_ERROR;
774         return 0;
775       }
776 
777       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
778       if(U_FAILURE(*status)) {
779         return 0;
780       }
781 
782       if (_hasBCP47Extension(localeID)) {
783           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
784       } else {
785           tmpLocaleID=localeID;
786       }
787 
788       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
789       if(startSearchHere == NULL) {
790           /* no keywords, return at once */
791           return 0;
792       }
793 
794       /* find the first keyword */
795       while(startSearchHere) {
796           const char* keyValueTail;
797           int32_t keyValueLen;
798 
799           startSearchHere++; /* skip @ or ; */
800           nextSeparator = uprv_strchr(startSearchHere, '=');
801           if(!nextSeparator) {
802               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
803               return 0;
804           }
805           /* strip leading & trailing spaces (TC decided to tolerate these) */
806           while(*startSearchHere == ' ') {
807               startSearchHere++;
808           }
809           keyValueTail = nextSeparator;
810           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
811               keyValueTail--;
812           }
813           /* now keyValueTail points to first char after the keyName */
814           /* copy & normalize keyName from locale */
815           if (startSearchHere == keyValueTail) {
816               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
817               return 0;
818           }
819           keyValueLen = 0;
820           while (startSearchHere < keyValueTail) {
821             if (!UPRV_ISALPHANUM(*startSearchHere)) {
822               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
823               return 0;
824             }
825             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
826               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
827             } else {
828               /* keyword name too long for internal buffer */
829               *status = U_INTERNAL_PROGRAM_ERROR;
830               return 0;
831             }
832           }
833           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
834 
835           startSearchHere = uprv_strchr(nextSeparator, ';');
836 
837           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
838                /* current entry matches the keyword. */
839              nextSeparator++; /* skip '=' */
840               /* First strip leading & trailing spaces (TC decided to tolerate these) */
841               while(*nextSeparator == ' ') {
842                 nextSeparator++;
843               }
844               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
845               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
846                 keyValueTail--;
847               }
848               /* Now copy the value, but check well-formedness */
849               if (nextSeparator == keyValueTail) {
850                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
851                 return 0;
852               }
853               keyValueLen = 0;
854               while (nextSeparator < keyValueTail) {
855                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
856                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
857                   return 0;
858                 }
859                 if (keyValueLen < bufferCapacity) {
860                   /* Should we lowercase value to return here? Tests expect as-is. */
861                   buffer[keyValueLen++] = *nextSeparator++;
862                 } else { /* keep advancing so we return correct length in case of overflow */
863                   keyValueLen++;
864                   nextSeparator++;
865                 }
866               }
867               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
868               return result;
869           }
870       }
871     }
872     return 0;
873 }
874 
875 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)876 uloc_setKeywordValue(const char* keywordName,
877                      const char* keywordValue,
878                      char* buffer, int32_t bufferCapacity,
879                      UErrorCode* status)
880 {
881     /* TODO: sorting. removal. */
882     int32_t keywordNameLen;
883     int32_t keywordValueLen;
884     int32_t bufLen;
885     int32_t needLen = 0;
886     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
887     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
888     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
889     int32_t rc;
890     char* nextSeparator = NULL;
891     char* nextEqualsign = NULL;
892     char* startSearchHere = NULL;
893     char* keywordStart = NULL;
894     CharString updatedKeysAndValues;
895     int32_t updatedKeysAndValuesLen;
896     UBool handledInputKeyAndValue = FALSE;
897     char keyValuePrefix = '@';
898 
899     if(U_FAILURE(*status)) {
900         return -1;
901     }
902     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
903         *status = U_ILLEGAL_ARGUMENT_ERROR;
904         return 0;
905     }
906     bufLen = (int32_t)uprv_strlen(buffer);
907     if(bufferCapacity<bufLen) {
908         /* The capacity is less than the length?! Is this NULL terminated? */
909         *status = U_ILLEGAL_ARGUMENT_ERROR;
910         return 0;
911     }
912     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
913     if(U_FAILURE(*status)) {
914         return 0;
915     }
916 
917     keywordValueLen = 0;
918     if(keywordValue) {
919         while (*keywordValue != 0) {
920             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
921                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
922                 return 0;
923             }
924             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
925                 /* Should we force lowercase in value to set? */
926                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
927             } else {
928                 /* keywordValue too long for internal buffer */
929                 *status = U_INTERNAL_PROGRAM_ERROR;
930                 return 0;
931             }
932         }
933     }
934     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
935 
936     startSearchHere = (char*)locale_getKeywordsStart(buffer);
937     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
938         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
939             return bufLen;
940         }
941 
942         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
943         if(startSearchHere) { /* had a single @ */
944             needLen--; /* already had the @ */
945             /* startSearchHere points at the @ */
946         } else {
947             startSearchHere=buffer+bufLen;
948         }
949         if(needLen >= bufferCapacity) {
950             *status = U_BUFFER_OVERFLOW_ERROR;
951             return needLen; /* no change */
952         }
953         *startSearchHere++ = '@';
954         uprv_strcpy(startSearchHere, keywordNameBuffer);
955         startSearchHere += keywordNameLen;
956         *startSearchHere++ = '=';
957         uprv_strcpy(startSearchHere, keywordValueBuffer);
958         return needLen;
959     } /* end shortcut - no @ */
960 
961     keywordStart = startSearchHere;
962     /* search for keyword */
963     while(keywordStart) {
964         const char* keyValueTail;
965         int32_t keyValueLen;
966 
967         keywordStart++; /* skip @ or ; */
968         nextEqualsign = uprv_strchr(keywordStart, '=');
969         if (!nextEqualsign) {
970             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
971             return 0;
972         }
973         /* strip leading & trailing spaces (TC decided to tolerate these) */
974         while(*keywordStart == ' ') {
975             keywordStart++;
976         }
977         keyValueTail = nextEqualsign;
978         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
979             keyValueTail--;
980         }
981         /* now keyValueTail points to first char after the keyName */
982         /* copy & normalize keyName from locale */
983         if (keywordStart == keyValueTail) {
984             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
985             return 0;
986         }
987         keyValueLen = 0;
988         while (keywordStart < keyValueTail) {
989             if (!UPRV_ISALPHANUM(*keywordStart)) {
990                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
991                 return 0;
992             }
993             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
994                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
995             } else {
996                 /* keyword name too long for internal buffer */
997                 *status = U_INTERNAL_PROGRAM_ERROR;
998                 return 0;
999             }
1000         }
1001         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1002 
1003         nextSeparator = uprv_strchr(nextEqualsign, ';');
1004 
1005         /* start processing the value part */
1006         nextEqualsign++; /* skip '=' */
1007         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1008         while(*nextEqualsign == ' ') {
1009             nextEqualsign++;
1010         }
1011         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1012         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1013             keyValueTail--;
1014         }
1015         if (nextEqualsign == keyValueTail) {
1016             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1017             return 0;
1018         }
1019 
1020         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1021         if(rc == 0) {
1022             /* Current entry matches the input keyword. Update the entry */
1023             if(keywordValueLen > 0) { /* updating a value */
1024                 updatedKeysAndValues.append(keyValuePrefix, *status);
1025                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1026                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1027                 updatedKeysAndValues.append('=', *status);
1028                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1029             } /* else removing this entry, don't emit anything */
1030             handledInputKeyAndValue = TRUE;
1031         } else {
1032            /* input keyword sorts earlier than current entry, add before current entry */
1033             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1034                 /* insert new entry at this location */
1035                 updatedKeysAndValues.append(keyValuePrefix, *status);
1036                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1037                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1038                 updatedKeysAndValues.append('=', *status);
1039                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1040                 handledInputKeyAndValue = TRUE;
1041             }
1042             /* copy the current entry */
1043             updatedKeysAndValues.append(keyValuePrefix, *status);
1044             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1045             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1046             updatedKeysAndValues.append('=', *status);
1047             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1048         }
1049         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1050             /* append new entry at the end, it sorts later than existing entries */
1051             updatedKeysAndValues.append(keyValuePrefix, *status);
1052             /* skip keyValuePrefix update, no subsequent key-value pair */
1053             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1054             updatedKeysAndValues.append('=', *status);
1055             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1056             handledInputKeyAndValue = TRUE;
1057         }
1058         keywordStart = nextSeparator;
1059     } /* end loop searching */
1060 
1061     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1062      * problems with the passed-in locale. So if we did encounter problems with the
1063      * passed-in locale above, those errors took precedence and overrode any error
1064      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1065      * are errors here they are from updatedKeysAndValues.append; they do cause an
1066      * error return but the passed-in locale is unmodified and the original bufLen is
1067      * returned.
1068      */
1069     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1070         /* if input key/value specified removal of a keyword not present in locale, or
1071          * there was an error in CharString.append, leave original locale alone. */
1072         return bufLen;
1073     }
1074 
1075     updatedKeysAndValuesLen = updatedKeysAndValues.length();
1076     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1077     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1078     if(needLen >= bufferCapacity) {
1079         *status = U_BUFFER_OVERFLOW_ERROR;
1080         return needLen; /* no change */
1081     }
1082     if (updatedKeysAndValuesLen > 0) {
1083         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1084     }
1085     buffer[needLen]=0;
1086     return needLen;
1087 }
1088 
1089 /* ### ID parsing implementation **************************************************/
1090 
/* True if 'a' is one of the letters that can start a special prefix:
 * 'x', 'X', 'i' or 'I'.
 * The argument is evaluated several times; pass side-effect-free expressions. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* Returns TRUE if one of the special prefixes is here (s=string):
 * 'x-' or 'i-', i.e. a prefix letter followed by an ID separator. */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* True if 'a' ends a locale ID subtag run: NUL, '.' or '@'.
 * Dot terminates it because of the POSIX form, where the dot precedes the
 * codepage (except for variant).
 * The argument is evaluated several times; pass side-effect-free expressions. */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1101 
1102 /**
1103  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1104  * a NULL entry, followed by more entries, and a second NULL entry.
1105  *
1106  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1107  * COUNTRIES_3.
1108  */
_findIndex(const char * const * list,const char * key)1109 static int16_t _findIndex(const char* const* list, const char* key)
1110 {
1111     const char* const* anchor = list;
1112     int32_t pass = 0;
1113 
1114     /* Make two passes through two NULL-terminated arrays at 'list' */
1115     while (pass++ < 2) {
1116         while (*list) {
1117             if (uprv_strcmp(key, *list) == 0) {
1118                 return (int16_t)(list - anchor);
1119             }
1120             list++;
1121         }
1122         ++list;     /* skip final NULL *CWB*/
1123     }
1124     return -1;
1125 }
1126 
1127 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1128 uloc_getCurrentCountryID(const char* oldID){
1129     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1130     if (offset >= 0) {
1131         return REPLACEMENT_COUNTRIES[offset];
1132     }
1133     return oldID;
1134 }
1135 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1136 uloc_getCurrentLanguageID(const char* oldID){
1137     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1138     if (offset >= 0) {
1139         return REPLACEMENT_LANGUAGES[offset];
1140     }
1141     return oldID;
1142 }
1143 /*
1144  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1145  * avoid duplicating code to handle the earlier locale ID pieces
1146  * in the functions for the later ones by
1147  * setting the *pEnd pointer to where they stopped parsing
1148  *
1149  * TODO try to use this in Locale
1150  */
1151 static CharString
ulocimp_getLanguage(const char * localeID,const char ** pEnd,UErrorCode & status)1152 ulocimp_getLanguage(const char *localeID,
1153                     const char **pEnd,
1154                     UErrorCode &status) {
1155     CharString result;
1156 
1157     if (uprv_stricmp(localeID, "root") == 0) {
1158         localeID += 4;
1159     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1160                (localeID[3] == '\0' ||
1161                 localeID[3] == '-' ||
1162                 localeID[3] == '_' ||
1163                 localeID[3] == '@')) {
1164         localeID += 3;
1165     }
1166 
1167     /* if it starts with i- or x- then copy that prefix */
1168     if(_isIDPrefix(localeID)) {
1169         result.append((char)uprv_tolower(*localeID), status);
1170         result.append('-', status);
1171         localeID+=2;
1172     }
1173 
1174     /* copy the language as far as possible and count its length */
1175     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1176         result.append((char)uprv_tolower(*localeID), status);
1177         localeID++;
1178     }
1179 
1180     if(result.length()==3) {
1181         /* convert 3 character code to 2 character code if possible *CWB*/
1182         int32_t offset = _findIndex(LANGUAGES_3, result.data());
1183         if(offset>=0) {
1184             result.clear();
1185             result.append(LANGUAGES[offset], status);
1186         }
1187     }
1188 
1189     if(pEnd!=NULL) {
1190         *pEnd=localeID;
1191     }
1192 
1193     return result;
1194 }
1195 
1196 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1197 ulocimp_getLanguage(const char *localeID,
1198                     char *language, int32_t languageCapacity,
1199                     const char **pEnd) {
1200     ErrorCode status;
1201     CharString result = ulocimp_getLanguage(localeID, pEnd, status);
1202     if (status.isFailure()) {
1203         return 0;
1204     }
1205     int32_t reslen = result.length();
1206     uprv_memcpy(language, result.data(), std::min(reslen, languageCapacity));
1207     return reslen;
1208 }
1209 
1210 static CharString
ulocimp_getScript(const char * localeID,const char ** pEnd,UErrorCode & status)1211 ulocimp_getScript(const char *localeID,
1212                   const char **pEnd,
1213                   UErrorCode &status) {
1214     CharString result;
1215     int32_t idLen = 0;
1216 
1217     if (pEnd != NULL) {
1218         *pEnd = localeID;
1219     }
1220 
1221     /* copy the second item as far as possible and count its length */
1222     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1223             && uprv_isASCIILetter(localeID[idLen])) {
1224         idLen++;
1225     }
1226 
1227     /* If it's exactly 4 characters long, then it's a script and not a country. */
1228     if (idLen == 4) {
1229         int32_t i;
1230         if (pEnd != NULL) {
1231             *pEnd = localeID+idLen;
1232         }
1233         if (idLen >= 1) {
1234             result.append((char)uprv_toupper(*(localeID++)), status);
1235         }
1236         for (i = 1; i < idLen; i++) {
1237             result.append((char)uprv_tolower(*(localeID++)), status);
1238         }
1239     }
1240 
1241     return result;
1242 }
1243 
1244 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1245 ulocimp_getScript(const char *localeID,
1246                   char *script, int32_t scriptCapacity,
1247                   const char **pEnd) {
1248     ErrorCode status;
1249     CharString result = ulocimp_getScript(localeID, pEnd, status);
1250     if (status.isFailure()) {
1251         return 0;
1252     }
1253     int32_t reslen = result.length();
1254     uprv_memcpy(script, result.data(), std::min(reslen, scriptCapacity));
1255     return reslen;
1256 }
1257 
1258 static CharString
ulocimp_getCountry(const char * localeID,const char ** pEnd,UErrorCode & status)1259 ulocimp_getCountry(const char *localeID,
1260                    const char **pEnd,
1261                    UErrorCode &status) {
1262     CharString result;
1263     int32_t idLen=0;
1264 
1265     /* copy the country as far as possible and count its length */
1266     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1267         result.append((char)uprv_toupper(localeID[idLen]), status);
1268         idLen++;
1269     }
1270 
1271     /* the country should be either length 2 or 3 */
1272     if (idLen == 2 || idLen == 3) {
1273         /* convert 3 character code to 2 character code if possible *CWB*/
1274         if(idLen==3) {
1275             int32_t offset = _findIndex(COUNTRIES_3, result.data());
1276             if(offset>=0) {
1277                 result.clear();
1278                 result.append(COUNTRIES[offset], status);
1279             }
1280         }
1281         localeID+=idLen;
1282     } else {
1283         result.clear();
1284     }
1285 
1286     if(pEnd!=NULL) {
1287         *pEnd=localeID;
1288     }
1289 
1290     return result;
1291 }
1292 
1293 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1294 ulocimp_getCountry(const char *localeID,
1295                    char *country, int32_t countryCapacity,
1296                    const char **pEnd) {
1297     ErrorCode status;
1298     CharString result = ulocimp_getCountry(localeID, pEnd, status);
1299     if (status.isFailure()) {
1300         return 0;
1301     }
1302     int32_t reslen = result.length();
1303     uprv_memcpy(country, result.data(), std::min(reslen, countryCapacity));
1304     return reslen;
1305 }
1306 
1307 /**
1308  * @param needSeparator if true, then add leading '_' if any variants
1309  * are added to 'variant'
1310  */
1311 static void
_getVariantEx(const char * localeID,char prev,ByteSink & sink,UBool needSeparator)1312 _getVariantEx(const char *localeID,
1313               char prev,
1314               ByteSink& sink,
1315               UBool needSeparator) {
1316     UBool hasVariant = FALSE;
1317 
1318     /* get one or more variant tags and separate them with '_' */
1319     if(_isIDSeparator(prev)) {
1320         /* get a variant string after a '-' or '_' */
1321         while(!_isTerminator(*localeID)) {
1322             if (needSeparator) {
1323                 sink.Append("_", 1);
1324                 needSeparator = FALSE;
1325             }
1326             char c = (char)uprv_toupper(*localeID);
1327             if (c == '-') c = '_';
1328             sink.Append(&c, 1);
1329             hasVariant = TRUE;
1330             localeID++;
1331         }
1332     }
1333 
1334     /* if there is no variant tag after a '-' or '_' then look for '@' */
1335     if(!hasVariant) {
1336         if(prev=='@') {
1337             /* keep localeID */
1338         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1339             ++localeID; /* point after the '@' */
1340         } else {
1341             return;
1342         }
1343         while(!_isTerminator(*localeID)) {
1344             if (needSeparator) {
1345                 sink.Append("_", 1);
1346                 needSeparator = FALSE;
1347             }
1348             char c = (char)uprv_toupper(*localeID);
1349             if (c == '-' || c == ',') c = '_';
1350             sink.Append(&c, 1);
1351             localeID++;
1352         }
1353     }
1354 }
1355 
1356 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1357 _getVariantEx(const char *localeID,
1358               char prev,
1359               char *variant, int32_t variantCapacity,
1360               UBool needSeparator) {
1361     CheckedArrayByteSink sink(variant, variantCapacity);
1362     _getVariantEx(localeID, prev, sink, needSeparator);
1363     return sink.NumberOfBytesAppended();
1364 }
1365 
1366 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1367 _getVariant(const char *localeID,
1368             char prev,
1369             char *variant, int32_t variantCapacity) {
1370     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1371 }
1372 
1373 /* Keyword enumeration */
1374 
/* Iteration state stored in UEnumeration::context by the keyword enumerator. */
typedef struct UKeywordsContext {
    char *keywords;   /* start of the keyword list: consecutive NUL-terminated
                         strings, ended by an empty string; released with
                         uprv_free() when the enumeration is closed */
    char *current;    /* next keyword to hand out; points into 'keywords' */
} UKeywordsContext;
1379 
1380 U_CDECL_BEGIN
1381 
1382 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1383 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1384     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1385     uprv_free(enumerator->context);
1386     uprv_free(enumerator);
1387 }
1388 
1389 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1390 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1391     char *kw = ((UKeywordsContext *)en->context)->keywords;
1392     int32_t result = 0;
1393     while(*kw) {
1394         result++;
1395         kw += uprv_strlen(kw)+1;
1396     }
1397     return result;
1398 }
1399 
1400 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1401 uloc_kw_nextKeyword(UEnumeration* en,
1402                     int32_t* resultLength,
1403                     UErrorCode* /*status*/) {
1404     const char* result = ((UKeywordsContext *)en->context)->current;
1405     int32_t len = 0;
1406     if(*result) {
1407         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1408         ((UKeywordsContext *)en->context)->current += len+1;
1409     } else {
1410         result = NULL;
1411     }
1412     if (resultLength) {
1413         *resultLength = len;
1414     }
1415     return result;
1416 }
1417 
1418 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1419 uloc_kw_resetKeywords(UEnumeration* en,
1420                       UErrorCode* /*status*/) {
1421     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1422 }
1423 
1424 U_CDECL_END
1425 
1426 
1427 static const UEnumeration gKeywordsEnum = {
1428     NULL,
1429     NULL,
1430     uloc_kw_closeKeywords,
1431     uloc_kw_countKeywords,
1432     uenum_unextDefault,
1433     uloc_kw_nextKeyword,
1434     uloc_kw_resetKeywords
1435 };
1436 
1437 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1438 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1439 {
1440     LocalMemory<UKeywordsContext> myContext;
1441     LocalMemory<UEnumeration> result;
1442 
1443     if (U_FAILURE(*status)) {
1444         return nullptr;
1445     }
1446     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1447     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1448     if (myContext.isNull() || result.isNull()) {
1449         *status = U_MEMORY_ALLOCATION_ERROR;
1450         return nullptr;
1451     }
1452     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1453     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1454     if (myContext->keywords == nullptr) {
1455         *status = U_MEMORY_ALLOCATION_ERROR;
1456         return nullptr;
1457     }
1458     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1459     myContext->keywords[keywordListSize] = 0;
1460     myContext->current = myContext->keywords;
1461     result->context = myContext.orphan();
1462     return result.orphan();
1463 }
1464 
1465 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1466 uloc_openKeywords(const char* localeID,
1467                         UErrorCode* status)
1468 {
1469     int32_t i=0;
1470     char keywords[256];
1471     int32_t keywordsCapacity = 256;
1472     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1473     const char* tmpLocaleID;
1474 
1475     if(status==NULL || U_FAILURE(*status)) {
1476         return 0;
1477     }
1478 
1479     if (_hasBCP47Extension(localeID)) {
1480         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1481     } else {
1482         if (localeID==NULL) {
1483            localeID=uloc_getDefault();
1484         }
1485         tmpLocaleID=localeID;
1486     }
1487 
1488     /* Skip the language */
1489     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1490     if(_isIDSeparator(*tmpLocaleID)) {
1491         const char *scriptID;
1492         /* Skip the script if available */
1493         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1494         if(scriptID != tmpLocaleID+1) {
1495             /* Found optional script */
1496             tmpLocaleID = scriptID;
1497         }
1498         /* Skip the Country */
1499         if (_isIDSeparator(*tmpLocaleID)) {
1500             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1501             if(_isIDSeparator(*tmpLocaleID)) {
1502                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1503             }
1504         }
1505     }
1506 
1507     /* keywords are located after '@' */
1508     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1509         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, FALSE, status);
1510     }
1511 
1512     if(i) {
1513         return uloc_openKeywordList(keywords, i, status);
1514     } else {
1515         return NULL;
1516     }
1517 }
1518 
1519 
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * Evaluates to nonzero when at least one bit of `mask` is set in `options`.
 * Both arguments are parenthesized so that compound expressions such as
 * OPTION_SET(opts, A | B) expand to (opts & (A | B)) rather than
 * ((opts & A) | B).
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The text "i-default", deliberately without a terminating NUL. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1528 
1529 /**
1530  * Canonicalize the given localeID, to level 1 or to level 2,
1531  * depending on the options.  To specify level 1, pass in options=0.
1532  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1533  *
1534  * This is the code underlying uloc_getName and uloc_canonicalize.
 *
 * The base name (language, optional script, country, variant) is built
 * up in a local CharString and handed to the sink in a single Append();
 * keywords, when they are emitted, are appended to the sink afterwards.
 *
 * @param localeID the ID to process; NULL means uloc_getDefault(). IDs
 *                 for which _hasBCP47Extension() is true are first passed
 *                 through _ConvertBCP47().
 * @param sink     receives the resulting ID.
 * @param options  bit set of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS.
 * @param err      in/out error code; the function returns immediately if
 *                 it already indicates a failure on entry.
1535  */
1536 static void
_canonicalize(const char * localeID,ByteSink & sink,uint32_t options,UErrorCode * err)1537 _canonicalize(const char* localeID,
1538               ByteSink& sink,
1539               uint32_t options,
1540               UErrorCode* err) {
1541     int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
1542     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1543     const char* origLocaleID;
1544     const char* tmpLocaleID;
1545     const char* keywordAssign = NULL;
1546     const char* separatorIndicator = NULL;
1547 
1548     if (U_FAILURE(*err)) {
1549         return;
1550     }
1551 
1552     if (_hasBCP47Extension(localeID)) {
1553         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1554     } else {
1555         if (localeID==NULL) {
1556            localeID=uloc_getDefault();
1557         }
1558         tmpLocaleID=localeID;
1559     }
1560 
    /* Remember where parsing starts; tmpLocaleID is advanced below and the
       "i-default" test needs to look at the beginning of the ID. */
1561     origLocaleID=tmpLocaleID;
1562 
1563     /* get all pieces, one after another, and separate with '_' */
1564     CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1565 
    /* An ID whose language part has the length of "i-default" and that starts
       with that text is replaced by the default locale's ID. */
1566     if (tag.length() == I_DEFAULT_LENGTH &&
1567             uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1568         tag.clear();
1569         tag.append(uloc_getDefault(), *err);
1570     } else if(_isIDSeparator(*tmpLocaleID)) {
1571         const char *scriptID;
1572 
1573         ++fieldCount;
1574         tag.append('_', *err);
1575 
1576         CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1577         tag.append(script, *err);
1578         scriptSize = script.length();
1579         if(scriptSize > 0) {
1580             /* Found optional script */
1581             tmpLocaleID = scriptID;
1582             ++fieldCount;
1583             if (_isIDSeparator(*tmpLocaleID)) {
1584                 /* If there is something else, then we add the _ */
1585                 tag.append('_', *err);
1586             }
1587         }
1588 
1589         if (_isIDSeparator(*tmpLocaleID)) {
1590             const char *cntryID;
1591 
1592             CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1593             tag.append(country, *err);
1594             if (!country.isEmpty()) {
1595                 /* Found optional country */
1596                 tmpLocaleID = cntryID;
1597             }
1598             if(_isIDSeparator(*tmpLocaleID)) {
1599                 /* If there is something else, then we add the _  if we found country before. */
1600                 if (!_isIDSeparator(*(tmpLocaleID+1))) {
1601                     ++fieldCount;
1602                     tag.append('_', *err);
1603                 }
1604 
                /* variantSize becomes the number of bytes that _getVariantEx
                   appends to tag: minus the length before, plus the length after. */
1605                 variantSize = -tag.length();
1606                 {
1607                     CharStringByteSink s(&tag);
1608                     _getVariantEx(tmpLocaleID+1, *tmpLocaleID, s, FALSE);
1609                 }
1610                 variantSize += tag.length();
1611                 if (variantSize > 0) {
1612                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1613                 }
1614             }
1615         }
1616     }
1617 
1618     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    /* Only done when _ULOC_CANONICALIZE is not set; copies everything from
       the '.' up to, but not including, the next '@' or the end of the ID. */
1619     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1620         UBool done = FALSE;
1621         do {
1622             char c = *tmpLocaleID;
1623             switch (c) {
1624             case 0:
1625             case '@':
1626                 done = TRUE;
1627                 break;
1628             default:
1629                 tag.append(c, *err);
1630                 ++tmpLocaleID;
1631                 break;
1632             }
1633         } while (!done);
1634     }
1635 
1636     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1637        After this, tmpLocaleID either points to '@' or is NULL */
1638     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1639         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1640         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1641     }
1642 
1643     /* Copy POSIX-style variant, if any [mr@FOO] */
    /* Without _ULOC_CANONICALIZE the text from '@' to the end of the ID is
       copied verbatim, '@' included, when no '=' follows the '@'. */
1644     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1645         tmpLocaleID != NULL && keywordAssign == NULL) {
1646         for (;;) {
1647             char c = *tmpLocaleID;
1648             if (c == 0) {
1649                 break;
1650             }
1651             tag.append(c, *err);
1652             ++tmpLocaleID;
1653         }
1654     }
1655 
1656     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1657         /* Handle @FOO variant if @ is present and not followed by = */
1658         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1659             /* Add missing '_' if needed */
1660             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1661                 do {
1662                     tag.append('_', *err);
1663                     ++fieldCount;
1664                 } while(fieldCount<2);
1665             }
1666 
            /* Same before/after length measurement as for variantSize above;
               the last argument tells _getVariantEx whether a variant was
               already appended earlier. */
1667             int32_t posixVariantSize = -tag.length();
1668             {
1669                 CharStringByteSink s(&tag);
1670                 _getVariantEx(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
1671             }
1672             posixVariantSize += tag.length();
1673             if (posixVariantSize > 0) {
1674                 variantSize += posixVariantSize;
1675             }
1676         }
1677 
1678         /* Look up the ID in the canonicalization map */
        /* The first entry whose id equals the whole tag wins; the tag is then
           replaced by that entry's canonicalID. */
1679         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1680             StringPiece id(CANONICALIZE_MAP[j].id);
1681             if (tag == id) {
1682                 if (id.empty() && tmpLocaleID != NULL) {
1683                     break; /* Don't remap "" if keywords present */
1684                 }
1685                 tag.clear();
1686                 tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1687                 break;
1688             }
1689         }
1690     }
1691 
    /* Emit the assembled base name in one piece. */
1692     sink.Append(tag.data(), tag.length());
1693 
    /* Keywords are emitted only when an '=' follows the '@' and no ';'
       occurs before the first '='. */
1694     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1695         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1696             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1697             sink.Append("@", 1);
            /* NOTE(review): fieldCount is not read again after this increment. */
1698             ++fieldCount;
1699             _getKeywords(tmpLocaleID+1, '@', sink, TRUE, err);
1700         }
1701     }
1702 }
1703 
1704 /* ### ID parsing API **************************************************/
1705 
1706 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1707 uloc_getParent(const char*    localeID,
1708                char* parent,
1709                int32_t parentCapacity,
1710                UErrorCode* err)
1711 {
1712     const char *lastUnderscore;
1713     int32_t i;
1714 
1715     if (U_FAILURE(*err))
1716         return 0;
1717 
1718     if (localeID == NULL)
1719         localeID = uloc_getDefault();
1720 
1721     lastUnderscore=uprv_strrchr(localeID, '_');
1722     if(lastUnderscore!=NULL) {
1723         i=(int32_t)(lastUnderscore-localeID);
1724     } else {
1725         i=0;
1726     }
1727 
1728     if (i > 0) {
1729         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1730             localeID += 3;
1731             i -= 3;
1732             uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1733         } else if (parent != localeID) {
1734             uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1735         }
1736     }
1737 
1738     return u_terminateChars(parent, parentCapacity, i, err);
1739 }
1740 
1741 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1742 uloc_getLanguage(const char*    localeID,
1743          char* language,
1744          int32_t languageCapacity,
1745          UErrorCode* err)
1746 {
1747     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1748     int32_t i=0;
1749 
1750     if (err==NULL || U_FAILURE(*err)) {
1751         return 0;
1752     }
1753 
1754     if(localeID==NULL) {
1755         localeID=uloc_getDefault();
1756     }
1757 
1758     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1759     return u_terminateChars(language, languageCapacity, i, err);
1760 }
1761 
1762 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1763 uloc_getScript(const char*    localeID,
1764          char* script,
1765          int32_t scriptCapacity,
1766          UErrorCode* err)
1767 {
1768     int32_t i=0;
1769 
1770     if(err==NULL || U_FAILURE(*err)) {
1771         return 0;
1772     }
1773 
1774     if(localeID==NULL) {
1775         localeID=uloc_getDefault();
1776     }
1777 
1778     /* skip the language */
1779     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1780     if(_isIDSeparator(*localeID)) {
1781         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1782     }
1783     return u_terminateChars(script, scriptCapacity, i, err);
1784 }
1785 
1786 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1787 uloc_getCountry(const char* localeID,
1788             char* country,
1789             int32_t countryCapacity,
1790             UErrorCode* err)
1791 {
1792     int32_t i=0;
1793 
1794     if(err==NULL || U_FAILURE(*err)) {
1795         return 0;
1796     }
1797 
1798     if(localeID==NULL) {
1799         localeID=uloc_getDefault();
1800     }
1801 
1802     /* Skip the language */
1803     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1804     if(_isIDSeparator(*localeID)) {
1805         const char *scriptID;
1806         /* Skip the script if available */
1807         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1808         if(scriptID != localeID+1) {
1809             /* Found optional script */
1810             localeID = scriptID;
1811         }
1812         if(_isIDSeparator(*localeID)) {
1813             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1814         }
1815     }
1816     return u_terminateChars(country, countryCapacity, i, err);
1817 }
1818 
1819 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1820 uloc_getVariant(const char* localeID,
1821                 char* variant,
1822                 int32_t variantCapacity,
1823                 UErrorCode* err)
1824 {
1825     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1826     const char* tmpLocaleID;
1827     int32_t i=0;
1828 
1829     if(err==NULL || U_FAILURE(*err)) {
1830         return 0;
1831     }
1832 
1833     if (_hasBCP47Extension(localeID)) {
1834         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1835     } else {
1836         if (localeID==NULL) {
1837            localeID=uloc_getDefault();
1838         }
1839         tmpLocaleID=localeID;
1840     }
1841 
1842     /* Skip the language */
1843     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1844     if(_isIDSeparator(*tmpLocaleID)) {
1845         const char *scriptID;
1846         /* Skip the script if available */
1847         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1848         if(scriptID != tmpLocaleID+1) {
1849             /* Found optional script */
1850             tmpLocaleID = scriptID;
1851         }
1852         /* Skip the Country */
1853         if (_isIDSeparator(*tmpLocaleID)) {
1854             const char *cntryID;
1855             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
1856             if (cntryID != tmpLocaleID+1) {
1857                 /* Found optional country */
1858                 tmpLocaleID = cntryID;
1859             }
1860             if(_isIDSeparator(*tmpLocaleID)) {
1861                 /* If there was no country ID, skip a possible extra IDSeparator */
1862                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1863                     tmpLocaleID++;
1864                 }
1865                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
1866             }
1867         }
1868     }
1869 
1870     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1871     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1872 /*
1873     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1874         i=_getVariant(localeID+1, '@', variant, variantCapacity);
1875     }
1876 */
1877     return u_terminateChars(variant, variantCapacity, i, err);
1878 }
1879 
1880 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1881 uloc_getName(const char* localeID,
1882              char* name,
1883              int32_t nameCapacity,
1884              UErrorCode* err)
1885 {
1886     if (U_FAILURE(*err)) {
1887         return 0;
1888     }
1889 
1890     CheckedArrayByteSink sink(name, nameCapacity);
1891     ulocimp_getName(localeID, sink, err);
1892 
1893     int32_t reslen = sink.NumberOfBytesAppended();
1894 
1895     if (U_FAILURE(*err)) {
1896         return reslen;
1897     }
1898 
1899     if (sink.Overflowed()) {
1900         *err = U_BUFFER_OVERFLOW_ERROR;
1901     } else {
1902         u_terminateChars(name, nameCapacity, reslen, err);
1903     }
1904 
1905     return reslen;
1906 }
1907 
1908 U_STABLE void U_EXPORT2
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode * err)1909 ulocimp_getName(const char* localeID,
1910                 ByteSink& sink,
1911                 UErrorCode* err)
1912 {
1913     _canonicalize(localeID, sink, 0, err);
1914 }
1915 
1916 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1917 uloc_getBaseName(const char* localeID,
1918                  char* name,
1919                  int32_t nameCapacity,
1920                  UErrorCode* err)
1921 {
1922     if (U_FAILURE(*err)) {
1923         return 0;
1924     }
1925 
1926     CheckedArrayByteSink sink(name, nameCapacity);
1927     ulocimp_getBaseName(localeID, sink, err);
1928 
1929     int32_t reslen = sink.NumberOfBytesAppended();
1930 
1931     if (U_FAILURE(*err)) {
1932         return reslen;
1933     }
1934 
1935     if (sink.Overflowed()) {
1936         *err = U_BUFFER_OVERFLOW_ERROR;
1937     } else {
1938         u_terminateChars(name, nameCapacity, reslen, err);
1939     }
1940 
1941     return reslen;
1942 }
1943 
1944 U_STABLE void U_EXPORT2
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode * err)1945 ulocimp_getBaseName(const char* localeID,
1946                     ByteSink& sink,
1947                     UErrorCode* err)
1948 {
1949     _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1950 }
1951 
1952 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1953 uloc_canonicalize(const char* localeID,
1954                   char* name,
1955                   int32_t nameCapacity,
1956                   UErrorCode* err)
1957 {
1958     if (U_FAILURE(*err)) {
1959         return 0;
1960     }
1961 
1962     CheckedArrayByteSink sink(name, nameCapacity);
1963     ulocimp_canonicalize(localeID, sink, err);
1964 
1965     int32_t reslen = sink.NumberOfBytesAppended();
1966 
1967     if (U_FAILURE(*err)) {
1968         return reslen;
1969     }
1970 
1971     if (sink.Overflowed()) {
1972         *err = U_BUFFER_OVERFLOW_ERROR;
1973     } else {
1974         u_terminateChars(name, nameCapacity, reslen, err);
1975     }
1976 
1977     return reslen;
1978 }
1979 
1980 U_STABLE void U_EXPORT2
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode * err)1981 ulocimp_canonicalize(const char* localeID,
1982                      ByteSink& sink,
1983                      UErrorCode* err)
1984 {
1985     _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1986 }
1987 
1988 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)1989 uloc_getISO3Language(const char* localeID)
1990 {
1991     int16_t offset;
1992     char lang[ULOC_LANG_CAPACITY];
1993     UErrorCode err = U_ZERO_ERROR;
1994 
1995     if (localeID == NULL)
1996     {
1997         localeID = uloc_getDefault();
1998     }
1999     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2000     if (U_FAILURE(err))
2001         return "";
2002     offset = _findIndex(LANGUAGES, lang);
2003     if (offset < 0)
2004         return "";
2005     return LANGUAGES_3[offset];
2006 }
2007 
2008 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2009 uloc_getISO3Country(const char* localeID)
2010 {
2011     int16_t offset;
2012     char cntry[ULOC_LANG_CAPACITY];
2013     UErrorCode err = U_ZERO_ERROR;
2014 
2015     if (localeID == NULL)
2016     {
2017         localeID = uloc_getDefault();
2018     }
2019     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2020     if (U_FAILURE(err))
2021         return "";
2022     offset = _findIndex(COUNTRIES, cntry);
2023     if (offset < 0)
2024         return "";
2025 
2026     return COUNTRIES_3[offset];
2027 }
2028 
2029 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2030 uloc_getLCID(const char* localeID)
2031 {
2032     UErrorCode status = U_ZERO_ERROR;
2033     char       langID[ULOC_FULLNAME_CAPACITY];
2034     uint32_t   lcid = 0;
2035 
2036     /* Check for incomplete id. */
2037     if (!localeID || uprv_strlen(localeID) < 2) {
2038         return 0;
2039     }
2040 
2041     // First, attempt Windows platform lookup if available, but fall
2042     // through to catch any special cases (ICU vs Windows name differences).
2043     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2044     if (U_FAILURE(status)) {
2045         return 0;
2046     }
2047     if (lcid > 0) {
2048         // Windows found an LCID, return that
2049         return lcid;
2050     }
2051 
2052     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2053     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2054         return 0;
2055     }
2056 
2057     if (uprv_strchr(localeID, '@')) {
2058         // uprv_convertToLCID does not support keywords other than collation.
2059         // Remove all keywords except collation.
2060         int32_t len;
2061         char collVal[ULOC_KEYWORDS_CAPACITY];
2062         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2063 
2064         len = uloc_getKeywordValue(localeID, "collation", collVal,
2065             UPRV_LENGTHOF(collVal) - 1, &status);
2066 
2067         if (U_SUCCESS(status) && len > 0) {
2068             collVal[len] = 0;
2069 
2070             len = uloc_getBaseName(localeID, tmpLocaleID,
2071                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2072 
2073             if (U_SUCCESS(status) && len > 0) {
2074                 tmpLocaleID[len] = 0;
2075 
2076                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2077                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2078 
2079                 if (U_SUCCESS(status) && len > 0) {
2080                     tmpLocaleID[len] = 0;
2081                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2082                 }
2083             }
2084         }
2085 
2086         // fall through - all keywords are simply ignored
2087         status = U_ZERO_ERROR;
2088     }
2089 
2090     return uprv_convertToLCID(langID, localeID, &status);
2091 }
2092 
2093 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2094 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2095                 UErrorCode *status)
2096 {
2097     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2098 }
2099 
2100 /* ### Default locale **************************************************/
2101 
2102 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2103 uloc_getDefault()
2104 {
2105     return locale_get_default();
2106 }
2107 
2108 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2109 uloc_setDefault(const char*   newDefaultLocale,
2110              UErrorCode* err)
2111 {
2112     if (U_FAILURE(*err))
2113         return;
2114     /* the error code isn't currently used for anything by this function*/
2115 
2116     /* propagate change to C++ */
2117     locale_set_default(newDefaultLocale);
2118 }
2119 
2120 /**
2121  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2122  * to an array of pointers to arrays of char.  All of these pointers are owned
2123  * by ICU-- do not delete them, and do not write through them.  The array is
2124  * terminated with a null pointer.
2125  */
2126 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2127 uloc_getISOLanguages()
2128 {
2129     return LANGUAGES;
2130 }
2131 
2132 /**
2133  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2134  * pointer to an array of pointers to arrays of char.  All of these pointers are
2135  * owned by ICU-- do not delete them, and do not write through them.  The array is
2136  * terminated with a null pointer.
2137  */
2138 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2139 uloc_getISOCountries()
2140 {
2141     return COUNTRIES;
2142 }
2143 
2144 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2145 uloc_toUnicodeLocaleKey(const char* keyword)
2146 {
2147     const char* bcpKey = ulocimp_toBcpKey(keyword);
2148     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2149         // unknown keyword, but syntax is fine..
2150         return keyword;
2151     }
2152     return bcpKey;
2153 }
2154 
2155 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2156 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2157 {
2158     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2159     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2160         // unknown keyword, but syntax is fine..
2161         return value;
2162     }
2163     return bcpType;
2164 }
2165 
2166 static UBool
isWellFormedLegacyKey(const char * legacyKey)2167 isWellFormedLegacyKey(const char* legacyKey)
2168 {
2169     const char* p = legacyKey;
2170     while (*p) {
2171         if (!UPRV_ISALPHANUM(*p)) {
2172             return FALSE;
2173         }
2174         p++;
2175     }
2176     return TRUE;
2177 }
2178 
2179 static UBool
isWellFormedLegacyType(const char * legacyType)2180 isWellFormedLegacyType(const char* legacyType)
2181 {
2182     const char* p = legacyType;
2183     int32_t alphaNumLen = 0;
2184     while (*p) {
2185         if (*p == '_' || *p == '/' || *p == '-') {
2186             if (alphaNumLen == 0) {
2187                 return FALSE;
2188             }
2189             alphaNumLen = 0;
2190         } else if (UPRV_ISALPHANUM(*p)) {
2191             alphaNumLen++;
2192         } else {
2193             return FALSE;
2194         }
2195         p++;
2196     }
2197     return (alphaNumLen != 0);
2198 }
2199 
2200 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2201 uloc_toLegacyKey(const char* keyword)
2202 {
2203     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2204     if (legacyKey == NULL) {
2205         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2206         //
2207         // Note:
2208         //  LDML/CLDR provides some definition of keyword syntax in
2209         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2210         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2211         //  Keys can only consist of [0-9a-zA-Z].
2212         if (isWellFormedLegacyKey(keyword)) {
2213             return keyword;
2214         }
2215     }
2216     return legacyKey;
2217 }
2218 
2219 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2220 uloc_toLegacyType(const char* keyword, const char* value)
2221 {
2222     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2223     if (legacyType == NULL) {
2224         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2225         //
2226         // Note:
2227         //  LDML/CLDR provides some definition of keyword syntax in
2228         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2229         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2230         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2231         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2232         if (isWellFormedLegacyType(value)) {
2233             return value;
2234         }
2235     }
2236     return legacyType;
2237 }
2238 
2239 /*eof*/
2240