• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/bytestream.h"
34 #include "unicode/errorcode.h"
35 #include "unicode/stringpiece.h"
36 #include "unicode/utypes.h"
37 #include "unicode/ustring.h"
38 #include "unicode/uloc.h"
39 
40 #include "bytesinkutil.h"
41 #include "putilimp.h"
42 #include "ustr_imp.h"
43 #include "ulocimp.h"
44 #include "umutex.h"
45 #include "cstring.h"
46 #include "cmemory.h"
47 #include "locmap.h"
48 #include "uarrsort.h"
49 #include "uenumimp.h"
50 #include "uassert.h"
51 #include "charstr.h"
52 
53 U_NAMESPACE_USE
54 
55 /* ### Declarations **************************************************/
56 
57 /* Locale stuff from locid.cpp */
/*
 * These two functions are only declared here, with C linkage via U_CFUNC;
 * no definition is visible in this section.
 * NOTE(review): presumably they set and return the process-wide default
 * locale ID -- confirm against locid.cpp.
 */
58 U_CFUNC void locale_set_default(const char *id);
59 U_CFUNC const char *locale_get_default(void);
60 
61 /* ### Data tables **************************************************/
62 
63 /**
64  * Table of language codes, both 2- and 3-letter, with preference
65  * given to 2-letter codes where possible.  Includes 3-letter codes
66  * that lack a 2-letter equivalent.
67  *
68  * This list must be in sorted order.  This list is returned directly
69  * to the user by some API.
70  *
71  * This list must be kept in sync with LANGUAGES_3, with corresponding
72  * entries matched.
73  *
74  * This table is terminated with a NULL entry, followed by a
75  * second list, and another NULL entry.  The first list is visible to
76  * user code when this array is returned by API.  The second list
77  * contains codes we support, but do not expose through user API.
78  *
79  * Notes:
80  *
81  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82  * include the revisions up to 2001/7/27 *CWB*
83  *
84  * The 3 character codes are the terminology codes like RFC 3066.  This
85  * is compatible with prior ICU codes.
86  *
87  * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are still in
88  * the table, but now in the second list at the end, because their
89  * 3 character codes are duplicates.  This avoids bad searches going
90  * from 3 to 2 character codes.
91  *
92  * The range qaa-qtz is reserved for local use.
93  */
94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95 /* ISO639 table version is 20150505 */
96 /* Subsequent hand addition of selected languages */
97 static const char * const LANGUAGES[] = {
98     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
99     "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
100     "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
101     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
102     "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
103     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
104     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
105     "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
106     "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
107     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
108     "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
109     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
110     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
111     "cs",  "csb", "cu",  "cv",  "cy",
112     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
113     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
114     "dyo", "dyu", "dz",  "dzg",
115     "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
116     "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
117     "ext",
118     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
119     "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
120     "frs", "fur", "fy",
121     "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
122     "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
123     "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
124     "gur", "guz", "gv",  "gwi",
125     "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
126     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
127     "hup", "hy",  "hz",
128     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
129     "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
130     "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
131     "jv",
132     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
133     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
134     "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
135     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
136     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
137     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
138     "kv",  "kw",  "ky",
139     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
140     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
141     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
142     "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
143     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
144     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
145     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
146     "ml",  "mn",  "mnc", "mni",
147     "moh", "mos", "mr",  "mrj",
148     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
149     "my",  "mye", "myv", "mzn",
150     "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
151     "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
152     "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
153     "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
154     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
155     "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
156     "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
157     "pon", "prg", "pro", "ps",  "pt",
158     "qu",  "quc", "qug",
159     "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
160     "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
161     "rw",  "rwk",
162     "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
163     "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
164     "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
165     "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
166     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
167     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
168     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
169     "sv",  "sw",  "swb", "syc", "syr", "szl",
170     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
171     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
172     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
173     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
174     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
175     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
176     "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
177     "vot", "vro", "vun",
178     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
179     "xal", "xh",  "xmf", "xog",
180     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
181     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
182     "zun", "zxx", "zza",
183 NULL, /* end of the first (user-visible) list */
184     "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
185 NULL /* end of the second (not exposed) list */
186 };
187 
/*
 * Withdrawn language codes and their current equivalents.  The two arrays
 * line up by position (in/id, iw/he, ji/yi, jw/jv) and each ends with two
 * NULL entries.
 * NOTE(review): presumably REPLACEMENT_LANGUAGES[i] is substituted for
 * DEPRECATED_LANGUAGES[i] by lookup code outside this section -- confirm
 * against the callers before reordering either array.
 */
188 static const char* const DEPRECATED_LANGUAGES[]={
189     "in", "iw", "ji", "jw", NULL, NULL
190 };
191 static const char* const REPLACEMENT_LANGUAGES[]={
192     "id", "he", "yi", "jv", NULL, NULL
193 };
194 
195 /**
196  * Table of 3-letter language codes.
197  *
198  * This is a lookup table used to convert 3-letter language codes to
199  * their 2-letter equivalent, where possible.  It must be kept in sync
200  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
201  * same language as LANGUAGES_3[i].  The commented-out line in the
202  * second list is copied from LANGUAGES to make comparison easier.
203  *
204  * Where a 3-letter language code has no 2-letter equivalent, the
205  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206  *
207  * This table is terminated with a NULL entry, followed by a
208  * second list, and another NULL entry.  The two lists correspond to
209  * the two lists in LANGUAGES.
210  */
211 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
212 /* ISO639 table version is 20150505 */
213 /* Subsequent hand addition of selected languages */
214 static const char * const LANGUAGES_3[] = {
215     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
216     "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
217     "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
218     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
219     "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
220     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
221     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
222     "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
223     "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
224     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
225     "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
226     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
227     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
228     "ces", "csb", "chu", "chv", "cym",
229     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
230     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
231     "dyo", "dyu", "dzo", "dzg",
232     "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
233     "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
234     "ext",
235     "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
236     "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
237     "frs", "fur", "fry",
238     "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
239     "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
240     "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
241     "gur", "guz", "glv", "gwi",
242     "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
243     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
244     "hup", "hye", "her",
245     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
246     "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
247     "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
248     "jav",
249     "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
250     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
251     "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
252     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
253     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
254     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
255     "kom", "cor", "kir",
256     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
257     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
258     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
259     "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
260     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
261     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
262     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
263     "mal", "mon", "mnc", "mni",
264     "moh", "mos", "mar", "mrj",
265     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
266     "mya", "mye", "myv", "mzn",
267     "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
268     "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
269     "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
270     "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
271     "oci", "oji", "orm", "ori", "oss", "osa", "ota",
272     "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
273     "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
274     "pon", "prg", "pro", "pus", "por",
275     "que", "quc", "qug",
276     "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
277     "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
278     "kin", "rwk",
279     "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
280     "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
281     "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
282     "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
283     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
284     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
285     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
286     "swe", "swa", "swb", "syc", "syr", "szl",
287     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
288     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
289     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
290     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
291     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
292     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
293     "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
294     "vot", "vro", "vun",
295     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
296     "xal", "xho", "xmf", "xog",
297     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
298     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
299     "zun", "zxx", "zza",
300 NULL, /* end of the first list */
301 /*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
302     "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
303 NULL /* end of the second list */
304 };
305 
306 /**
307  * Table of 2-letter country codes.
308  *
309  * This list must be in sorted order.  This list is returned directly
310  * to the user by some API.
311  *
312  * This list must be kept in sync with COUNTRIES_3, with corresponding
313  * entries matched.
314  *
315  * This table is terminated with a NULL entry, followed by a
316  * second list, and another NULL entry.  The first list is visible to
317  * user code when this array is returned by API.  The second list
318  * contains codes we support, but do not expose through user API.
319  *
320  * Notes:
321  *
322  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html
324  * New codes were added, keeping the old ones for compatibility;
325  * updated to include the 1999/12/03 revisions *CWB*
326  *
327  * RO(ROM) is now RO(ROU) according to
328  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329  */
330 static const char * const COUNTRIES[] = {
331     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
332     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
333     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
334     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
335     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
336     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
337     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
338     "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
339     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
340     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
341     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
342     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
343     "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
344     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
345     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
346     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
347     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
348     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
349     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
350     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
351     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
352     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
353     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
354     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
355     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
356     "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
357     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
358     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
359     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
360     "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
361 NULL, /* end of the first (user-visible) list */
362     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
363 NULL /* end of the second (not exposed) list */
364 };
365 
/*
 * Deprecated country codes and their replacements.  The two arrays line up
 * by position, as the commented-out row inside REPLACEMENT_COUNTRIES shows,
 * and each ends with two NULL entries.
 * NOTE(review): presumably REPLACEMENT_COUNTRIES[i] is substituted for
 * DEPRECATED_COUNTRIES[i] by lookup code outside this section -- confirm
 * against the callers.
 * NOTE(review): the COUNTRIES header comment says FX(FXX) became PS(PSE),
 * but this table maps "FX" to "FR" -- confirm which one is intended.
 */
366 static const char* const DEPRECATED_COUNTRIES[] = {
367     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
368 };
369 static const char* const REPLACEMENT_COUNTRIES[] = {
370 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
371     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
372 };
373 
374 /**
375  * Table of 3-letter country codes.
376  *
377  * This is a lookup table used to convert 3-letter country codes to
378  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
379  * For all valid i, COUNTRIES[i] must refer to the same country as
380  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
381  * to make it easier to compare the two tables by eye.
382  *
383  * This table is terminated with a NULL entry, followed by a
384  * second list, and another NULL entry.  The two lists correspond to
385  * the two lists in COUNTRIES.
386  */
387 static const char * const COUNTRIES_3[] = {
388 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
389     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
390 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
391     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
392 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
393     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
394 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
395     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
396 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
397     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
398 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
399     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
400 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
401     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
402 /*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
403     "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
404 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
405     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
406 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
407     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
408 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
409     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
410 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
411     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
412 /*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
413     "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
414 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
415     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
416 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
417     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
418 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
419     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
420 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
421     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
422 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
423     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
424 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
425     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
426 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
427     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
428 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
429     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
430 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
431     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
432 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
433     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
434 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
435     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
436 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
437     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
438 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
439     "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
440 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
441     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
442 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
443     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
444 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
445     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
446 /*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
447     "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
448 NULL, /* end of the first list */
449 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
450     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
451 NULL /* end of the second list */
452 };
453 
/* One row of CANONICALIZE_MAP: a locale ID spelling and the ID it canonicalizes to. */
454 typedef struct CanonicalizationMap {
455     const char *id;          /* input ID */
456     const char *canonicalID; /* canonicalized output ID */
457 } CanonicalizationMap;
458 
459 /**
460  * A map to canonicalize locale IDs.  This handles a variety of
461  * different semantic kinds of transformations.  Each row is
462  * { id, canonicalID }; the array has no NULL terminator row.
463  */
464 static const CanonicalizationMap CANONICALIZE_MAP[] = {
465     { "art__LOJBAN",    "jbo" }, /* registered name */
466     { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
467     { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
468     { "zh__GUOYU",      "zh" }, /* registered name */
469     { "zh__HAKKA",      "hak" }, /* registered name */
470     { "zh__XIANG",      "hsn" }, /* registered name */
471     // subtags with 3 chars won't be treated as variants.
472     { "zh_GAN",         "gan" }, /* registered name */
473     { "zh_MIN_NAN",     "nan" }, /* registered name */
474     { "zh_WUU",         "wuu" }, /* registered name */
475     { "zh_YUE",         "yue" }, /* registered name */
476 };
476 
477 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id looks like a BCP47 tag with an extension: the
 * pointer is non-NULL, the string contains no '@', and the shortest subtag
 * reported by getShortestSubtagLength() is exactly one character long.
 *
 * The expansion refers only to its own argument, so it works for any
 * pointer expression and not just for a variable that happens to be named
 * localeID at the point of use.  The argument is evaluated more than once;
 * pass an expression without side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
480 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
_ConvertBCP47(const char * id,char * buffer,int32_t length,UErrorCode * err,int32_t * pLocaleIdSize)481 static const char* _ConvertBCP47(
482         const char* id, char* buffer, int32_t length,
483         UErrorCode* err, int32_t* pLocaleIdSize) {
484     const char* finalID;
485     int32_t localeIDSize = uloc_forLanguageTag(id, buffer, length, NULL, err);
486     if (localeIDSize <= 0 || U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) {
487         finalID=id;
488         if (*err == U_STRING_NOT_TERMINATED_WARNING) {
489             *err = U_BUFFER_OVERFLOW_ERROR;
490         }
491     } else {
492         finalID=buffer;
493     }
494     if (pLocaleIdSize != nullptr) {
495         *pLocaleIdSize = localeIDSize;
496     }
497     return finalID;
498 }
499 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)500 static int32_t getShortestSubtagLength(const char *localeID) {
501     int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
502     int32_t length = localeIDLength;
503     int32_t tmpLength = 0;
504     int32_t i;
505     UBool reset = TRUE;
506 
507     for (i = 0; i < localeIDLength; i++) {
508         if (localeID[i] != '_' && localeID[i] != '-') {
509             if (reset) {
510                 tmpLength = 0;
511                 reset = FALSE;
512             }
513             tmpLength++;
514         } else {
515             if (tmpLength != 0 && tmpLength < length) {
516                 length = tmpLength;
517             }
518             reset = TRUE;
519         }
520     }
521 
522     return length;
523 }
524 
525 /* ### Keywords **************************************************/
/* True when c lies between the characters '0' and '9' inclusive.  c is evaluated twice. */
526 #define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True when uprv_isASCIILetter(c) or UPRV_ISDIGIT(c) holds.  c is evaluated more than once. */
527 #define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
528 /* Punctuation/symbols allowed in legacy key values */
529 #define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')
530 
/*
 * Size in bytes of a keyword-name buffer (KeywordStruct::keyword and the buf
 * argument of locale_canonKeywordName), including the terminating NUL, so a
 * stored name holds at most ULOC_KEYWORD_BUFFER_LEN - 1 characters.
 */
531 #define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keywordList array in ulocimp_getKeywords(); one more keyword makes it set U_INTERNAL_PROGRAM_ERROR. */
532 #define ULOC_MAX_NO_KEYWORDS 25
533 
534 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)535 locale_getKeywordsStart(const char *localeID) {
536     const char *result = NULL;
537     if((result = uprv_strchr(localeID, '@')) != NULL) {
538         return result;
539     }
540 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541     else {
542         /* We do this because the @ sign is variant, and the @ sign used on one
543         EBCDIC machine won't be compiled the same way on other EBCDIC based
544         machines. */
545         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
546         const uint8_t *charToFind = ebcdicSigns;
547         while(*charToFind) {
548             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
549                 return result;
550             }
551             charToFind++;
552         }
553     }
554 #endif
555     return NULL;
556 }
557 
558 /**
559  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560  * @param keywordName incoming name to be canonicalized
561  * @param status return status (keyword too long)
562  * @return length of the keyword name
563  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)564 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
565 {
566   int32_t keywordNameLen = 0;
567 
568   for (; *keywordName != 0; keywordName++) {
569     if (!UPRV_ISALPHANUM(*keywordName)) {
570       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
571       return 0;
572     }
573     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
574       buf[keywordNameLen++] = uprv_tolower(*keywordName);
575     } else {
576       /* keyword name too long for internal buffer */
577       *status = U_INTERNAL_PROGRAM_ERROR;
578       return 0;
579     }
580   }
581   if (keywordNameLen == 0) {
582     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
583     return 0;
584   }
585   buf[keywordNameLen] = 0; /* terminate */
586 
587   return keywordNameLen;
588 }
589 
/* One keyword/value pair as collected by ulocimp_getKeywords(). */
590 typedef struct {
591     char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* keyword name, lowercased with spaces removed, NUL-terminated */
592     int32_t keywordLen;                    /* number of characters stored in keyword, not counting the NUL */
593     const char *valueStart;                /* first non-space character of the value, inside the parsed locale ID; not NUL-terminated */
594     int32_t valueLen;                      /* length of the value with trailing spaces excluded */
595 } KeywordStruct;
596 
597 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)598 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
599     const char* leftString = ((const KeywordStruct *)left)->keyword;
600     const char* rightString = ((const KeywordStruct *)right)->keyword;
601     return uprv_strcmp(leftString, rightString);
602 }
603 
604 U_CFUNC void
ulocimp_getKeywords(const char * localeID,char prev,ByteSink & sink,UBool valuesToo,UErrorCode * status)605 ulocimp_getKeywords(const char *localeID,
606                     char prev,
607                     ByteSink& sink,
608                     UBool valuesToo,
609                     UErrorCode *status)
610 {
611     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612 
613     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614     int32_t numKeywords = 0;
615     const char* pos = localeID;
616     const char* equalSign = NULL;
617     const char* semicolon = NULL;
618     int32_t i = 0, j, n;
619 
620     if(prev == '@') { /* start of keyword definition */
621         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
622         do {
623             UBool duplicate = FALSE;
624             /* skip leading spaces */
625             while(*pos == ' ') {
626                 pos++;
627             }
628             if (!*pos) { /* handle trailing "; " */
629                 break;
630             }
631             if(numKeywords == maxKeywords) {
632                 *status = U_INTERNAL_PROGRAM_ERROR;
633                 return;
634             }
635             equalSign = uprv_strchr(pos, '=');
636             semicolon = uprv_strchr(pos, ';');
637             /* lack of '=' [foo@currency] is illegal */
638             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
639             if(!equalSign || (semicolon && semicolon<equalSign)) {
640                 *status = U_INVALID_FORMAT_ERROR;
641                 return;
642             }
643             /* need to normalize both keyword and keyword name */
644             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645                 /* keyword name too long for internal buffer */
646                 *status = U_INTERNAL_PROGRAM_ERROR;
647                 return;
648             }
649             for(i = 0, n = 0; i < equalSign - pos; ++i) {
650                 if (pos[i] != ' ') {
651                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652                 }
653             }
654 
655             /* zero-length keyword is an error. */
656             if (n == 0) {
657                 *status = U_INVALID_FORMAT_ERROR;
658                 return;
659             }
660 
661             keywordList[numKeywords].keyword[n] = 0;
662             keywordList[numKeywords].keywordLen = n;
663             /* now grab the value part. First we skip the '=' */
664             equalSign++;
665             /* then we skip leading spaces */
666             while(*equalSign == ' ') {
667                 equalSign++;
668             }
669 
670             /* Premature end or zero-length value */
671             if (!*equalSign || equalSign == semicolon) {
672                 *status = U_INVALID_FORMAT_ERROR;
673                 return;
674             }
675 
676             keywordList[numKeywords].valueStart = equalSign;
677 
678             pos = semicolon;
679             i = 0;
680             if(pos) {
681                 while(*(pos - i - 1) == ' ') {
682                     i++;
683                 }
684                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685                 pos++;
686             } else {
687                 i = (int32_t)uprv_strlen(equalSign);
688                 while(i && equalSign[i-1] == ' ') {
689                     i--;
690                 }
691                 keywordList[numKeywords].valueLen = i;
692             }
693             /* If this is a duplicate keyword, then ignore it */
694             for (j=0; j<numKeywords; ++j) {
695                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
696                     duplicate = TRUE;
697                     break;
698                 }
699             }
700             if (!duplicate) {
701                 ++numKeywords;
702             }
703         } while(pos);
704 
705         /* now we have a list of keywords */
706         /* we need to sort it */
707         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
708 
709         /* Now construct the keyword part */
710         for(i = 0; i < numKeywords; i++) {
711             sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712             if(valuesToo) {
713                 sink.Append("=", 1);
714                 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715                 if(i < numKeywords - 1) {
716                     sink.Append(";", 1);
717                 }
718             } else {
719                 sink.Append("\0", 1);
720             }
721         }
722     }
723 }
724 
725 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)726 uloc_getKeywordValue(const char* localeID,
727                      const char* keywordName,
728                      char* buffer, int32_t bufferCapacity,
729                      UErrorCode* status)
730 {
731     if (U_FAILURE(*status)) {
732         return 0;
733     }
734 
735     CheckedArrayByteSink sink(buffer, bufferCapacity);
736     ulocimp_getKeywordValue(localeID, keywordName, sink, status);
737 
738     int32_t reslen = sink.NumberOfBytesAppended();
739 
740     if (U_FAILURE(*status)) {
741         return reslen;
742     }
743 
744     if (sink.Overflowed()) {
745         *status = U_BUFFER_OVERFLOW_ERROR;
746     } else {
747         u_terminateChars(buffer, bufferCapacity, reslen, status);
748     }
749 
750     return reslen;
751 }
752 
753 U_CAPI void U_EXPORT2
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode * status)754 ulocimp_getKeywordValue(const char* localeID,
755                         const char* keywordName,
756                         icu::ByteSink& sink,
757                         UErrorCode* status)
758 {
759     const char* startSearchHere = NULL;
760     const char* nextSeparator = NULL;
761     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
762     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
763 
764     if(status && U_SUCCESS(*status) && localeID) {
765       char tempBuffer[ULOC_FULLNAME_CAPACITY];
766       const char* tmpLocaleID;
767 
768       if (keywordName == NULL || keywordName[0] == 0) {
769         *status = U_ILLEGAL_ARGUMENT_ERROR;
770         return;
771       }
772 
773       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
774       if(U_FAILURE(*status)) {
775         return;
776       }
777 
778       if (_hasBCP47Extension(localeID)) {
779           tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
780                                       sizeof(tempBuffer), status, nullptr);
781       } else {
782           tmpLocaleID=localeID;
783       }
784 
785       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
786       if(startSearchHere == NULL) {
787           /* no keywords, return at once */
788           return;
789       }
790 
791       /* find the first keyword */
792       while(startSearchHere) {
793           const char* keyValueTail;
794           int32_t keyValueLen;
795 
796           startSearchHere++; /* skip @ or ; */
797           nextSeparator = uprv_strchr(startSearchHere, '=');
798           if(!nextSeparator) {
799               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
800               return;
801           }
802           /* strip leading & trailing spaces (TC decided to tolerate these) */
803           while(*startSearchHere == ' ') {
804               startSearchHere++;
805           }
806           keyValueTail = nextSeparator;
807           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
808               keyValueTail--;
809           }
810           /* now keyValueTail points to first char after the keyName */
811           /* copy & normalize keyName from locale */
812           if (startSearchHere == keyValueTail) {
813               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
814               return;
815           }
816           keyValueLen = 0;
817           while (startSearchHere < keyValueTail) {
818             if (!UPRV_ISALPHANUM(*startSearchHere)) {
819               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
820               return;
821             }
822             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
823               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
824             } else {
825               /* keyword name too long for internal buffer */
826               *status = U_INTERNAL_PROGRAM_ERROR;
827               return;
828             }
829           }
830           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
831 
832           startSearchHere = uprv_strchr(nextSeparator, ';');
833 
834           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
835                /* current entry matches the keyword. */
836              nextSeparator++; /* skip '=' */
837               /* First strip leading & trailing spaces (TC decided to tolerate these) */
838               while(*nextSeparator == ' ') {
839                 nextSeparator++;
840               }
841               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
842               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
843                 keyValueTail--;
844               }
845               /* Now copy the value, but check well-formedness */
846               if (nextSeparator == keyValueTail) {
847                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
848                 return;
849               }
850               while (nextSeparator < keyValueTail) {
851                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
852                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
853                   return;
854                 }
855                 /* Should we lowercase value to return here? Tests expect as-is. */
856                 sink.Append(nextSeparator++, 1);
857               }
858               return;
859           }
860       }
861     }
862 }
863 
864 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)865 uloc_setKeywordValue(const char* keywordName,
866                      const char* keywordValue,
867                      char* buffer, int32_t bufferCapacity,
868                      UErrorCode* status)
869 {
870     /* TODO: sorting. removal. */
871     int32_t keywordNameLen;
872     int32_t keywordValueLen;
873     int32_t bufLen;
874     int32_t needLen = 0;
875     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
876     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
877     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
878     int32_t rc;
879     char* nextSeparator = NULL;
880     char* nextEqualsign = NULL;
881     char* startSearchHere = NULL;
882     char* keywordStart = NULL;
883     CharString updatedKeysAndValues;
884     UBool handledInputKeyAndValue = FALSE;
885     char keyValuePrefix = '@';
886 
887     if(U_FAILURE(*status)) {
888         return -1;
889     }
890     if (*status == U_STRING_NOT_TERMINATED_WARNING) {
891         *status = U_ZERO_ERROR;
892     }
893     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
894         *status = U_ILLEGAL_ARGUMENT_ERROR;
895         return 0;
896     }
897     bufLen = (int32_t)uprv_strlen(buffer);
898     if(bufferCapacity<bufLen) {
899         /* The capacity is less than the length?! Is this NULL terminated? */
900         *status = U_ILLEGAL_ARGUMENT_ERROR;
901         return 0;
902     }
903     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
904     if(U_FAILURE(*status)) {
905         return 0;
906     }
907 
908     keywordValueLen = 0;
909     if(keywordValue) {
910         while (*keywordValue != 0) {
911             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
912                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
913                 return 0;
914             }
915             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
916                 /* Should we force lowercase in value to set? */
917                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
918             } else {
919                 /* keywordValue too long for internal buffer */
920                 *status = U_INTERNAL_PROGRAM_ERROR;
921                 return 0;
922             }
923         }
924     }
925     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
926 
927     startSearchHere = (char*)locale_getKeywordsStart(buffer);
928     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
929         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
930             U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
931             return bufLen;
932         }
933 
934         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
935         if(startSearchHere) { /* had a single @ */
936             needLen--; /* already had the @ */
937             /* startSearchHere points at the @ */
938         } else {
939             startSearchHere=buffer+bufLen;
940         }
941         if(needLen >= bufferCapacity) {
942             *status = U_BUFFER_OVERFLOW_ERROR;
943             return needLen; /* no change */
944         }
945         *startSearchHere++ = '@';
946         uprv_strcpy(startSearchHere, keywordNameBuffer);
947         startSearchHere += keywordNameLen;
948         *startSearchHere++ = '=';
949         uprv_strcpy(startSearchHere, keywordValueBuffer);
950         U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
951         return needLen;
952     } /* end shortcut - no @ */
953 
954     keywordStart = startSearchHere;
955     /* search for keyword */
956     while(keywordStart) {
957         const char* keyValueTail;
958         int32_t keyValueLen;
959 
960         keywordStart++; /* skip @ or ; */
961         nextEqualsign = uprv_strchr(keywordStart, '=');
962         if (!nextEqualsign) {
963             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
964             return 0;
965         }
966         /* strip leading & trailing spaces (TC decided to tolerate these) */
967         while(*keywordStart == ' ') {
968             keywordStart++;
969         }
970         keyValueTail = nextEqualsign;
971         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
972             keyValueTail--;
973         }
974         /* now keyValueTail points to first char after the keyName */
975         /* copy & normalize keyName from locale */
976         if (keywordStart == keyValueTail) {
977             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
978             return 0;
979         }
980         keyValueLen = 0;
981         while (keywordStart < keyValueTail) {
982             if (!UPRV_ISALPHANUM(*keywordStart)) {
983                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
984                 return 0;
985             }
986             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
987                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
988             } else {
989                 /* keyword name too long for internal buffer */
990                 *status = U_INTERNAL_PROGRAM_ERROR;
991                 return 0;
992             }
993         }
994         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
995 
996         nextSeparator = uprv_strchr(nextEqualsign, ';');
997 
998         /* start processing the value part */
999         nextEqualsign++; /* skip '=' */
1000         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1001         while(*nextEqualsign == ' ') {
1002             nextEqualsign++;
1003         }
1004         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1005         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1006             keyValueTail--;
1007         }
1008         if (nextEqualsign == keyValueTail) {
1009             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1010             return 0;
1011         }
1012 
1013         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1014         if(rc == 0) {
1015             /* Current entry matches the input keyword. Update the entry */
1016             if(keywordValueLen > 0) { /* updating a value */
1017                 updatedKeysAndValues.append(keyValuePrefix, *status);
1018                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1019                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1020                 updatedKeysAndValues.append('=', *status);
1021                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1022             } /* else removing this entry, don't emit anything */
1023             handledInputKeyAndValue = TRUE;
1024         } else {
1025            /* input keyword sorts earlier than current entry, add before current entry */
1026             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1027                 /* insert new entry at this location */
1028                 updatedKeysAndValues.append(keyValuePrefix, *status);
1029                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1030                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1031                 updatedKeysAndValues.append('=', *status);
1032                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1033                 handledInputKeyAndValue = TRUE;
1034             }
1035             /* copy the current entry */
1036             updatedKeysAndValues.append(keyValuePrefix, *status);
1037             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1038             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1039             updatedKeysAndValues.append('=', *status);
1040             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1041         }
1042         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1043             /* append new entry at the end, it sorts later than existing entries */
1044             updatedKeysAndValues.append(keyValuePrefix, *status);
1045             /* skip keyValuePrefix update, no subsequent key-value pair */
1046             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1047             updatedKeysAndValues.append('=', *status);
1048             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1049             handledInputKeyAndValue = TRUE;
1050         }
1051         keywordStart = nextSeparator;
1052     } /* end loop searching */
1053 
1054     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1055      * problems with the passed-in locale. So if we did encounter problems with the
1056      * passed-in locale above, those errors took precedence and overrode any error
1057      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1058      * are errors here they are from updatedKeysAndValues.append; they do cause an
1059      * error return but the passed-in locale is unmodified and the original bufLen is
1060      * returned.
1061      */
1062     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1063         /* if input key/value specified removal of a keyword not present in locale, or
1064          * there was an error in CharString.append, leave original locale alone. */
1065         U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1066         return bufLen;
1067     }
1068 
1069     // needLen = length of the part before '@'
1070     needLen = (int32_t)(startSearchHere - buffer);
1071     // Check to see can we fit the startSearchHere, if not, return
1072     // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1073     // We do this because this API function does not behave like most others:
1074     // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1075     // When the contents fits but without the terminating NUL, in this case we need to not change
1076     // the buffer contents and return with a buffer overflow error.
1077     int32_t appendLength = updatedKeysAndValues.length();
1078     if (appendLength >= bufferCapacity - needLen) {
1079         *status = U_BUFFER_OVERFLOW_ERROR;
1080         return needLen + appendLength;
1081     }
1082     needLen += updatedKeysAndValues.extract(
1083                          startSearchHere, bufferCapacity - needLen, *status);
1084     U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1085     return needLen;
1086 }
1087 
1088 /* ### ID parsing implementation **************************************************/
1089 
/* Character-class helpers for locale ID parsing.
 * Every parameter is parenthesized so that compound arguments (for example a
 * conditional expression) expand correctly. Each macro still evaluates its
 * argument more than once, so do not pass expressions with side effects.
 */

/* nonzero if 'a' is one of the letters x, X, i, I */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* nonzero if one of the special prefixes is here (s=string):
   'x-' or 'i-', i.e. a prefix letter followed by an ID separator */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1100 
1101 /**
1102  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1103  * a NULL entry, followed by more entries, and a second NULL entry.
1104  *
1105  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1106  * COUNTRIES_3.
1107  */
_findIndex(const char * const * list,const char * key)1108 static int16_t _findIndex(const char* const* list, const char* key)
1109 {
1110     const char* const* anchor = list;
1111     int32_t pass = 0;
1112 
1113     /* Make two passes through two NULL-terminated arrays at 'list' */
1114     while (pass++ < 2) {
1115         while (*list) {
1116             if (uprv_strcmp(key, *list) == 0) {
1117                 return (int16_t)(list - anchor);
1118             }
1119             list++;
1120         }
1121         ++list;     /* skip final NULL *CWB*/
1122     }
1123     return -1;
1124 }
1125 
1126 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1127 uloc_getCurrentCountryID(const char* oldID){
1128     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1129     if (offset >= 0) {
1130         return REPLACEMENT_COUNTRIES[offset];
1131     }
1132     return oldID;
1133 }
1134 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1135 uloc_getCurrentLanguageID(const char* oldID){
1136     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1137     if (offset >= 0) {
1138         return REPLACEMENT_LANGUAGES[offset];
1139     }
1140     return oldID;
1141 }
1142 /*
1143  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1144  * avoid duplicating code to handle the earlier locale ID pieces
1145  * in the functions for the later ones by
1146  * setting the *pEnd pointer to where they stopped parsing
1147  *
1148  * TODO try to use this in Locale
1149  */
1150 CharString U_EXPORT2
ulocimp_getLanguage(const char * localeID,const char ** pEnd,UErrorCode & status)1151 ulocimp_getLanguage(const char *localeID,
1152                     const char **pEnd,
1153                     UErrorCode &status) {
1154     CharString result;
1155 
1156     if (uprv_stricmp(localeID, "root") == 0) {
1157         localeID += 4;
1158     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1159                (localeID[3] == '\0' ||
1160                 localeID[3] == '-' ||
1161                 localeID[3] == '_' ||
1162                 localeID[3] == '@')) {
1163         localeID += 3;
1164     }
1165 
1166     /* if it starts with i- or x- then copy that prefix */
1167     if(_isIDPrefix(localeID)) {
1168         result.append((char)uprv_tolower(*localeID), status);
1169         result.append('-', status);
1170         localeID+=2;
1171     }
1172 
1173     /* copy the language as far as possible and count its length */
1174     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1175         result.append((char)uprv_tolower(*localeID), status);
1176         localeID++;
1177     }
1178 
1179     if(result.length()==3) {
1180         /* convert 3 character code to 2 character code if possible *CWB*/
1181         int32_t offset = _findIndex(LANGUAGES_3, result.data());
1182         if(offset>=0) {
1183             result.clear();
1184             result.append(LANGUAGES[offset], status);
1185         }
1186     }
1187 
1188     if(pEnd!=NULL) {
1189         *pEnd=localeID;
1190     }
1191 
1192     return result;
1193 }
1194 
1195 CharString U_EXPORT2
ulocimp_getScript(const char * localeID,const char ** pEnd,UErrorCode & status)1196 ulocimp_getScript(const char *localeID,
1197                   const char **pEnd,
1198                   UErrorCode &status) {
1199     CharString result;
1200     int32_t idLen = 0;
1201 
1202     if (pEnd != NULL) {
1203         *pEnd = localeID;
1204     }
1205 
1206     /* copy the second item as far as possible and count its length */
1207     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1208             && uprv_isASCIILetter(localeID[idLen])) {
1209         idLen++;
1210     }
1211 
1212     /* If it's exactly 4 characters long, then it's a script and not a country. */
1213     if (idLen == 4) {
1214         int32_t i;
1215         if (pEnd != NULL) {
1216             *pEnd = localeID+idLen;
1217         }
1218         if (idLen >= 1) {
1219             result.append((char)uprv_toupper(*(localeID++)), status);
1220         }
1221         for (i = 1; i < idLen; i++) {
1222             result.append((char)uprv_tolower(*(localeID++)), status);
1223         }
1224     }
1225 
1226     return result;
1227 }
1228 
1229 CharString U_EXPORT2
ulocimp_getCountry(const char * localeID,const char ** pEnd,UErrorCode & status)1230 ulocimp_getCountry(const char *localeID,
1231                    const char **pEnd,
1232                    UErrorCode &status) {
1233     CharString result;
1234     int32_t idLen=0;
1235 
1236     /* copy the country as far as possible and count its length */
1237     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1238         result.append((char)uprv_toupper(localeID[idLen]), status);
1239         idLen++;
1240     }
1241 
1242     /* the country should be either length 2 or 3 */
1243     if (idLen == 2 || idLen == 3) {
1244         /* convert 3 character code to 2 character code if possible *CWB*/
1245         if(idLen==3) {
1246             int32_t offset = _findIndex(COUNTRIES_3, result.data());
1247             if(offset>=0) {
1248                 result.clear();
1249                 result.append(COUNTRIES[offset], status);
1250             }
1251         }
1252         localeID+=idLen;
1253     } else {
1254         result.clear();
1255     }
1256 
1257     if(pEnd!=NULL) {
1258         *pEnd=localeID;
1259     }
1260 
1261     return result;
1262 }
1263 
1264 /**
1265  * @param needSeparator if true, then add leading '_' if any variants
1266  * are added to 'variant'
1267  */
1268 static void
_getVariant(const char * localeID,char prev,ByteSink & sink,UBool needSeparator)1269 _getVariant(const char *localeID,
1270             char prev,
1271             ByteSink& sink,
1272             UBool needSeparator) {
1273     UBool hasVariant = FALSE;
1274 
1275     /* get one or more variant tags and separate them with '_' */
1276     if(_isIDSeparator(prev)) {
1277         /* get a variant string after a '-' or '_' */
1278         while(!_isTerminator(*localeID)) {
1279             if (needSeparator) {
1280                 sink.Append("_", 1);
1281                 needSeparator = FALSE;
1282             }
1283             char c = (char)uprv_toupper(*localeID);
1284             if (c == '-') c = '_';
1285             sink.Append(&c, 1);
1286             hasVariant = TRUE;
1287             localeID++;
1288         }
1289     }
1290 
1291     /* if there is no variant tag after a '-' or '_' then look for '@' */
1292     if(!hasVariant) {
1293         if(prev=='@') {
1294             /* keep localeID */
1295         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1296             ++localeID; /* point after the '@' */
1297         } else {
1298             return;
1299         }
1300         while(!_isTerminator(*localeID)) {
1301             if (needSeparator) {
1302                 sink.Append("_", 1);
1303                 needSeparator = FALSE;
1304             }
1305             char c = (char)uprv_toupper(*localeID);
1306             if (c == '-' || c == ',') c = '_';
1307             sink.Append(&c, 1);
1308             localeID++;
1309         }
1310     }
1311 }
1312 
1313 /* Keyword enumeration */
1314 
/* Per-enumeration state for the keyword enumerator below. */
struct UKeywordsContext {
    /* Heap block owned by the enumeration: consecutive NUL-terminated
     * keyword strings, the list being ended by an empty string. */
    char* keywords;
    /* Start of the keyword that the next call to "next" returns. */
    char* current;
};
typedef struct UKeywordsContext UKeywordsContext;
1319 
1320 U_CDECL_BEGIN
1321 
1322 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1323 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1324     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1325     uprv_free(enumerator->context);
1326     uprv_free(enumerator);
1327 }
1328 
1329 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1330 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1331     char *kw = ((UKeywordsContext *)en->context)->keywords;
1332     int32_t result = 0;
1333     while(*kw) {
1334         result++;
1335         kw += uprv_strlen(kw)+1;
1336     }
1337     return result;
1338 }
1339 
1340 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1341 uloc_kw_nextKeyword(UEnumeration* en,
1342                     int32_t* resultLength,
1343                     UErrorCode* /*status*/) {
1344     const char* result = ((UKeywordsContext *)en->context)->current;
1345     int32_t len = 0;
1346     if(*result) {
1347         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1348         ((UKeywordsContext *)en->context)->current += len+1;
1349     } else {
1350         result = NULL;
1351     }
1352     if (resultLength) {
1353         *resultLength = len;
1354     }
1355     return result;
1356 }
1357 
1358 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1359 uloc_kw_resetKeywords(UEnumeration* en,
1360                       UErrorCode* /*status*/) {
1361     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1362 }
1363 
1364 U_CDECL_END
1365 
1366 
1367 static const UEnumeration gKeywordsEnum = {
1368     NULL,
1369     NULL,
1370     uloc_kw_closeKeywords,
1371     uloc_kw_countKeywords,
1372     uenum_unextDefault,
1373     uloc_kw_nextKeyword,
1374     uloc_kw_resetKeywords
1375 };
1376 
1377 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1378 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1379 {
1380     LocalMemory<UKeywordsContext> myContext;
1381     LocalMemory<UEnumeration> result;
1382 
1383     if (U_FAILURE(*status)) {
1384         return nullptr;
1385     }
1386     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1387     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1388     if (myContext.isNull() || result.isNull()) {
1389         *status = U_MEMORY_ALLOCATION_ERROR;
1390         return nullptr;
1391     }
1392     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1393     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1394     if (myContext->keywords == nullptr) {
1395         *status = U_MEMORY_ALLOCATION_ERROR;
1396         return nullptr;
1397     }
1398     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1399     myContext->keywords[keywordListSize] = 0;
1400     myContext->current = myContext->keywords;
1401     result->context = myContext.orphan();
1402     return result.orphan();
1403 }
1404 
1405 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1406 uloc_openKeywords(const char* localeID,
1407                         UErrorCode* status)
1408 {
1409     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1410     const char* tmpLocaleID;
1411 
1412     if(status==NULL || U_FAILURE(*status)) {
1413         return 0;
1414     }
1415 
1416     if (_hasBCP47Extension(localeID)) {
1417         tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
1418                                     sizeof(tempBuffer), status, nullptr);
1419     } else {
1420         if (localeID==NULL) {
1421             localeID=uloc_getDefault();
1422         }
1423         tmpLocaleID=localeID;
1424     }
1425 
1426     /* Skip the language */
1427     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1428     if (U_FAILURE(*status)) {
1429         return 0;
1430     }
1431 
1432     if(_isIDSeparator(*tmpLocaleID)) {
1433         const char *scriptID;
1434         /* Skip the script if available */
1435         ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
1436         if (U_FAILURE(*status)) {
1437             return 0;
1438         }
1439         if(scriptID != tmpLocaleID+1) {
1440             /* Found optional script */
1441             tmpLocaleID = scriptID;
1442         }
1443         /* Skip the Country */
1444         if (_isIDSeparator(*tmpLocaleID)) {
1445             ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
1446             if (U_FAILURE(*status)) {
1447                 return 0;
1448             }
1449         }
1450     }
1451 
1452     /* keywords are located after '@' */
1453     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1454         CharString keywords;
1455         CharStringByteSink sink(&keywords);
1456         ulocimp_getKeywords(tmpLocaleID+1, '@', sink, FALSE, status);
1457         if (U_FAILURE(*status)) {
1458             return NULL;
1459         }
1460         return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1461     }
1462     return NULL;
1463 }
1464 
1465 
/* bit-flags for the 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * Evaluates to true when at least one bit of 'mask' is set in 'options'.
 * Both arguments are parenthesized so that compound expressions such as
 * (a | b) are tested as a whole instead of being split by operator precedence.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The characters of "i-default", stored without a terminating NUL. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1474 
1475 /**
1476  * Canonicalize the given localeID, to level 1 or to level 2,
1477  * depending on the options.  To specify level 1, pass in options=0.
1478  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1479  *
1480  * This is the code underlying uloc_getName and uloc_canonicalize.
1481  */
1482 static void
_canonicalize(const char * localeID,ByteSink & sink,uint32_t options,UErrorCode * err)1483 _canonicalize(const char* localeID,
1484               ByteSink& sink,
1485               uint32_t options,
1486               UErrorCode* err) {
1487     if (U_FAILURE(*err)) {
1488         return;
1489     }
1490 
1491     int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
1492     PreflightingLocaleIDBuffer tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1493     CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1494     const char* origLocaleID;
1495     const char* tmpLocaleID;
1496     const char* keywordAssign = NULL;
1497     const char* separatorIndicator = NULL;
1498 
1499     if (_hasBCP47Extension(localeID)) {
1500         const char* localeIDPtr = localeID;
1501 
1502         // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1503         if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
1504             localeIDWithHyphens.append(localeID, -1, *err);
1505             if (U_SUCCESS(*err)) {
1506                 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1507                     if (*p == '_') {
1508                         *p = '-';
1509                     }
1510                 }
1511                 localeIDPtr = localeIDWithHyphens.data();
1512             }
1513         }
1514 
1515         do {
1516             // After this call tmpLocaleID may point to localeIDPtr which may
1517             // point to either localeID or localeIDWithHyphens.data().
1518             tmpLocaleID = _ConvertBCP47(localeIDPtr, tempBuffer.getBuffer(),
1519                                         tempBuffer.getCapacity(), err,
1520                                         &(tempBuffer.requestedCapacity));
1521         } while (tempBuffer.needToTryAgain(err));
1522     } else {
1523         if (localeID==NULL) {
1524            localeID=uloc_getDefault();
1525         }
1526         tmpLocaleID=localeID;
1527     }
1528 
1529     origLocaleID=tmpLocaleID;
1530 
1531     /* get all pieces, one after another, and separate with '_' */
1532     CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1533 
1534     if (tag.length() == I_DEFAULT_LENGTH &&
1535             uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1536         tag.clear();
1537         tag.append(uloc_getDefault(), *err);
1538     } else if(_isIDSeparator(*tmpLocaleID)) {
1539         const char *scriptID;
1540 
1541         ++fieldCount;
1542         tag.append('_', *err);
1543 
1544         CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1545         tag.append(script, *err);
1546         scriptSize = script.length();
1547         if(scriptSize > 0) {
1548             /* Found optional script */
1549             tmpLocaleID = scriptID;
1550             ++fieldCount;
1551             if (_isIDSeparator(*tmpLocaleID)) {
1552                 /* If there is something else, then we add the _ */
1553                 tag.append('_', *err);
1554             }
1555         }
1556 
1557         if (_isIDSeparator(*tmpLocaleID)) {
1558             const char *cntryID;
1559 
1560             CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1561             tag.append(country, *err);
1562             if (!country.isEmpty()) {
1563                 /* Found optional country */
1564                 tmpLocaleID = cntryID;
1565             }
1566             if(_isIDSeparator(*tmpLocaleID)) {
1567                 /* If there is something else, then we add the _  if we found country before. */
1568                 if (!_isIDSeparator(*(tmpLocaleID+1))) {
1569                     ++fieldCount;
1570                     tag.append('_', *err);
1571                 }
1572 
1573                 variantSize = -tag.length();
1574                 {
1575                     CharStringByteSink s(&tag);
1576                     _getVariant(tmpLocaleID+1, *tmpLocaleID, s, FALSE);
1577                 }
1578                 variantSize += tag.length();
1579                 if (variantSize > 0) {
1580                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1581                 }
1582             }
1583         }
1584     }
1585 
1586     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1587     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1588         UBool done = FALSE;
1589         do {
1590             char c = *tmpLocaleID;
1591             switch (c) {
1592             case 0:
1593             case '@':
1594                 done = TRUE;
1595                 break;
1596             default:
1597                 tag.append(c, *err);
1598                 ++tmpLocaleID;
1599                 break;
1600             }
1601         } while (!done);
1602     }
1603 
1604     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1605        After this, tmpLocaleID either points to '@' or is NULL */
1606     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1607         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1608         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1609     }
1610 
1611     /* Copy POSIX-style variant, if any [mr@FOO] */
1612     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1613         tmpLocaleID != NULL && keywordAssign == NULL) {
1614         for (;;) {
1615             char c = *tmpLocaleID;
1616             if (c == 0) {
1617                 break;
1618             }
1619             tag.append(c, *err);
1620             ++tmpLocaleID;
1621         }
1622     }
1623 
1624     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1625         /* Handle @FOO variant if @ is present and not followed by = */
1626         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1627             /* Add missing '_' if needed */
1628             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1629                 do {
1630                     tag.append('_', *err);
1631                     ++fieldCount;
1632                 } while(fieldCount<2);
1633             }
1634 
1635             int32_t posixVariantSize = -tag.length();
1636             {
1637                 CharStringByteSink s(&tag);
1638                 _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
1639             }
1640             posixVariantSize += tag.length();
1641             if (posixVariantSize > 0) {
1642                 variantSize += posixVariantSize;
1643             }
1644         }
1645 
1646         /* Look up the ID in the canonicalization map */
1647         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1648             StringPiece id(CANONICALIZE_MAP[j].id);
1649             if (tag == id) {
1650                 if (id.empty() && tmpLocaleID != NULL) {
1651                     break; /* Don't remap "" if keywords present */
1652                 }
1653                 tag.clear();
1654                 tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1655                 break;
1656             }
1657         }
1658     }
1659 
1660     sink.Append(tag.data(), tag.length());
1661 
1662     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1663         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1664             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1665             sink.Append("@", 1);
1666             ++fieldCount;
1667             ulocimp_getKeywords(tmpLocaleID+1, '@', sink, TRUE, err);
1668         }
1669     }
1670 }
1671 
1672 /* ### ID parsing API **************************************************/
1673 
1674 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1675 uloc_getParent(const char*    localeID,
1676                char* parent,
1677                int32_t parentCapacity,
1678                UErrorCode* err)
1679 {
1680     const char *lastUnderscore;
1681     int32_t i;
1682 
1683     if (U_FAILURE(*err))
1684         return 0;
1685 
1686     if (localeID == NULL)
1687         localeID = uloc_getDefault();
1688 
1689     lastUnderscore=uprv_strrchr(localeID, '_');
1690     if(lastUnderscore!=NULL) {
1691         i=(int32_t)(lastUnderscore-localeID);
1692     } else {
1693         i=0;
1694     }
1695 
1696     if (i > 0) {
1697         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1698             localeID += 3;
1699             i -= 3;
1700             uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1701         } else if (parent != localeID) {
1702             uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1703         }
1704     }
1705 
1706     return u_terminateChars(parent, parentCapacity, i, err);
1707 }
1708 
1709 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1710 uloc_getLanguage(const char*    localeID,
1711          char* language,
1712          int32_t languageCapacity,
1713          UErrorCode* err)
1714 {
1715     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1716 
1717     if (err==NULL || U_FAILURE(*err)) {
1718         return 0;
1719     }
1720 
1721     if(localeID==NULL) {
1722         localeID=uloc_getDefault();
1723     }
1724 
1725     return ulocimp_getLanguage(localeID, NULL, *err).extract(language, languageCapacity, *err);
1726 }
1727 
1728 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1729 uloc_getScript(const char*    localeID,
1730          char* script,
1731          int32_t scriptCapacity,
1732          UErrorCode* err)
1733 {
1734     if(err==NULL || U_FAILURE(*err)) {
1735         return 0;
1736     }
1737 
1738     if(localeID==NULL) {
1739         localeID=uloc_getDefault();
1740     }
1741 
1742     /* skip the language */
1743     ulocimp_getLanguage(localeID, &localeID, *err);
1744     if (U_FAILURE(*err)) {
1745         return 0;
1746     }
1747 
1748     if(_isIDSeparator(*localeID)) {
1749         return ulocimp_getScript(localeID+1, NULL, *err).extract(script, scriptCapacity, *err);
1750     }
1751     return u_terminateChars(script, scriptCapacity, 0, err);
1752 }
1753 
1754 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1755 uloc_getCountry(const char* localeID,
1756             char* country,
1757             int32_t countryCapacity,
1758             UErrorCode* err)
1759 {
1760     if(err==NULL || U_FAILURE(*err)) {
1761         return 0;
1762     }
1763 
1764     if(localeID==NULL) {
1765         localeID=uloc_getDefault();
1766     }
1767 
1768     /* Skip the language */
1769     ulocimp_getLanguage(localeID, &localeID, *err);
1770     if (U_FAILURE(*err)) {
1771         return 0;
1772     }
1773 
1774     if(_isIDSeparator(*localeID)) {
1775         const char *scriptID;
1776         /* Skip the script if available */
1777         ulocimp_getScript(localeID+1, &scriptID, *err);
1778         if (U_FAILURE(*err)) {
1779             return 0;
1780         }
1781         if(scriptID != localeID+1) {
1782             /* Found optional script */
1783             localeID = scriptID;
1784         }
1785         if(_isIDSeparator(*localeID)) {
1786             return ulocimp_getCountry(localeID+1, NULL, *err).extract(country, countryCapacity, *err);
1787         }
1788     }
1789     return u_terminateChars(country, countryCapacity, 0, err);
1790 }
1791 
1792 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1793 uloc_getVariant(const char* localeID,
1794                 char* variant,
1795                 int32_t variantCapacity,
1796                 UErrorCode* err)
1797 {
1798     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1799     const char* tmpLocaleID;
1800     int32_t i=0;
1801 
1802     if(err==NULL || U_FAILURE(*err)) {
1803         return 0;
1804     }
1805 
1806     if (_hasBCP47Extension(localeID)) {
1807         tmpLocaleID =_ConvertBCP47(localeID, tempBuffer, sizeof(tempBuffer), err, nullptr);
1808     } else {
1809         if (localeID==NULL) {
1810            localeID=uloc_getDefault();
1811         }
1812         tmpLocaleID=localeID;
1813     }
1814 
1815     /* Skip the language */
1816     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1817     if (U_FAILURE(*err)) {
1818         return 0;
1819     }
1820 
1821     if(_isIDSeparator(*tmpLocaleID)) {
1822         const char *scriptID;
1823         /* Skip the script if available */
1824         ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1825         if (U_FAILURE(*err)) {
1826             return 0;
1827         }
1828         if(scriptID != tmpLocaleID+1) {
1829             /* Found optional script */
1830             tmpLocaleID = scriptID;
1831         }
1832         /* Skip the Country */
1833         if (_isIDSeparator(*tmpLocaleID)) {
1834             const char *cntryID;
1835             ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1836             if (U_FAILURE(*err)) {
1837                 return 0;
1838             }
1839             if (cntryID != tmpLocaleID+1) {
1840                 /* Found optional country */
1841                 tmpLocaleID = cntryID;
1842             }
1843             if(_isIDSeparator(*tmpLocaleID)) {
1844                 /* If there was no country ID, skip a possible extra IDSeparator */
1845                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1846                     tmpLocaleID++;
1847                 }
1848 
1849                 CheckedArrayByteSink sink(variant, variantCapacity);
1850                 _getVariant(tmpLocaleID+1, *tmpLocaleID, sink, FALSE);
1851 
1852                 i = sink.NumberOfBytesAppended();
1853 
1854                 if (U_FAILURE(*err)) {
1855                     return i;
1856                 }
1857 
1858                 if (sink.Overflowed()) {
1859                     *err = U_BUFFER_OVERFLOW_ERROR;
1860                     return i;
1861                 }
1862             }
1863         }
1864     }
1865 
1866     return u_terminateChars(variant, variantCapacity, i, err);
1867 }
1868 
1869 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1870 uloc_getName(const char* localeID,
1871              char* name,
1872              int32_t nameCapacity,
1873              UErrorCode* err)
1874 {
1875     if (U_FAILURE(*err)) {
1876         return 0;
1877     }
1878 
1879     CheckedArrayByteSink sink(name, nameCapacity);
1880     ulocimp_getName(localeID, sink, err);
1881 
1882     int32_t reslen = sink.NumberOfBytesAppended();
1883 
1884     if (U_FAILURE(*err)) {
1885         return reslen;
1886     }
1887 
1888     if (sink.Overflowed()) {
1889         *err = U_BUFFER_OVERFLOW_ERROR;
1890     } else {
1891         u_terminateChars(name, nameCapacity, reslen, err);
1892     }
1893 
1894     return reslen;
1895 }
1896 
1897 U_CAPI void U_EXPORT2
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode * err)1898 ulocimp_getName(const char* localeID,
1899                 ByteSink& sink,
1900                 UErrorCode* err)
1901 {
1902     _canonicalize(localeID, sink, 0, err);
1903 }
1904 
1905 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1906 uloc_getBaseName(const char* localeID,
1907                  char* name,
1908                  int32_t nameCapacity,
1909                  UErrorCode* err)
1910 {
1911     if (U_FAILURE(*err)) {
1912         return 0;
1913     }
1914 
1915     CheckedArrayByteSink sink(name, nameCapacity);
1916     ulocimp_getBaseName(localeID, sink, err);
1917 
1918     int32_t reslen = sink.NumberOfBytesAppended();
1919 
1920     if (U_FAILURE(*err)) {
1921         return reslen;
1922     }
1923 
1924     if (sink.Overflowed()) {
1925         *err = U_BUFFER_OVERFLOW_ERROR;
1926     } else {
1927         u_terminateChars(name, nameCapacity, reslen, err);
1928     }
1929 
1930     return reslen;
1931 }
1932 
1933 U_CAPI void U_EXPORT2
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode * err)1934 ulocimp_getBaseName(const char* localeID,
1935                     ByteSink& sink,
1936                     UErrorCode* err)
1937 {
1938     _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1939 }
1940 
1941 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1942 uloc_canonicalize(const char* localeID,
1943                   char* name,
1944                   int32_t nameCapacity,
1945                   UErrorCode* err)
1946 {
1947     if (U_FAILURE(*err)) {
1948         return 0;
1949     }
1950 
1951     CheckedArrayByteSink sink(name, nameCapacity);
1952     ulocimp_canonicalize(localeID, sink, err);
1953 
1954     int32_t reslen = sink.NumberOfBytesAppended();
1955 
1956     if (U_FAILURE(*err)) {
1957         return reslen;
1958     }
1959 
1960     if (sink.Overflowed()) {
1961         *err = U_BUFFER_OVERFLOW_ERROR;
1962     } else {
1963         u_terminateChars(name, nameCapacity, reslen, err);
1964     }
1965 
1966     return reslen;
1967 }
1968 
1969 U_CAPI void U_EXPORT2
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode * err)1970 ulocimp_canonicalize(const char* localeID,
1971                      ByteSink& sink,
1972                      UErrorCode* err)
1973 {
1974     _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1975 }
1976 
1977 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)1978 uloc_getISO3Language(const char* localeID)
1979 {
1980     int16_t offset;
1981     char lang[ULOC_LANG_CAPACITY];
1982     UErrorCode err = U_ZERO_ERROR;
1983 
1984     if (localeID == NULL)
1985     {
1986         localeID = uloc_getDefault();
1987     }
1988     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1989     if (U_FAILURE(err))
1990         return "";
1991     offset = _findIndex(LANGUAGES, lang);
1992     if (offset < 0)
1993         return "";
1994     return LANGUAGES_3[offset];
1995 }
1996 
1997 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)1998 uloc_getISO3Country(const char* localeID)
1999 {
2000     int16_t offset;
2001     char cntry[ULOC_LANG_CAPACITY];
2002     UErrorCode err = U_ZERO_ERROR;
2003 
2004     if (localeID == NULL)
2005     {
2006         localeID = uloc_getDefault();
2007     }
2008     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2009     if (U_FAILURE(err))
2010         return "";
2011     offset = _findIndex(COUNTRIES, cntry);
2012     if (offset < 0)
2013         return "";
2014 
2015     return COUNTRIES_3[offset];
2016 }
2017 
2018 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2019 uloc_getLCID(const char* localeID)
2020 {
2021     UErrorCode status = U_ZERO_ERROR;
2022     char       langID[ULOC_FULLNAME_CAPACITY];
2023     uint32_t   lcid = 0;
2024 
2025     /* Check for incomplete id. */
2026     if (!localeID || uprv_strlen(localeID) < 2) {
2027         return 0;
2028     }
2029 
2030     // First, attempt Windows platform lookup if available, but fall
2031     // through to catch any special cases (ICU vs Windows name differences).
2032     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2033     if (U_FAILURE(status)) {
2034         return 0;
2035     }
2036     if (lcid > 0) {
2037         // Windows found an LCID, return that
2038         return lcid;
2039     }
2040 
2041     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2042     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2043         return 0;
2044     }
2045 
2046     if (uprv_strchr(localeID, '@')) {
2047         // uprv_convertToLCID does not support keywords other than collation.
2048         // Remove all keywords except collation.
2049         int32_t len;
2050         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2051 
2052         CharString collVal;
2053         {
2054             CharStringByteSink sink(&collVal);
2055             ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2056         }
2057 
2058         if (U_SUCCESS(status) && !collVal.isEmpty()) {
2059             len = uloc_getBaseName(localeID, tmpLocaleID,
2060                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2061 
2062             if (U_SUCCESS(status) && len > 0) {
2063                 tmpLocaleID[len] = 0;
2064 
2065                 len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2066                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2067 
2068                 if (U_SUCCESS(status) && len > 0) {
2069                     tmpLocaleID[len] = 0;
2070                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2071                 }
2072             }
2073         }
2074 
2075         // fall through - all keywords are simply ignored
2076         status = U_ZERO_ERROR;
2077     }
2078 
2079     return uprv_convertToLCID(langID, localeID, &status);
2080 }
2081 
2082 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2083 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2084                 UErrorCode *status)
2085 {
2086     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2087 }
2088 
2089 /* ### Default locale **************************************************/
2090 
2091 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2092 uloc_getDefault()
2093 {
2094     return locale_get_default();
2095 }
2096 
2097 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2098 uloc_setDefault(const char*   newDefaultLocale,
2099              UErrorCode* err)
2100 {
2101     if (U_FAILURE(*err))
2102         return;
2103     /* the error code isn't currently used for anything by this function*/
2104 
2105     /* propagate change to C++ */
2106     locale_set_default(newDefaultLocale);
2107 }
2108 
2109 /**
2110  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2111  * to an array of pointers to arrays of char.  All of these pointers are owned
2112  * by ICU-- do not delete them, and do not write through them.  The array is
2113  * terminated with a null pointer.
2114  */
2115 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2116 uloc_getISOLanguages()
2117 {
2118     return LANGUAGES;
2119 }
2120 
2121 /**
2122  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2123  * pointer to an array of pointers to arrays of char.  All of these pointers are
2124  * owned by ICU-- do not delete them, and do not write through them.  The array is
2125  * terminated with a null pointer.
2126  */
2127 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2128 uloc_getISOCountries()
2129 {
2130     return COUNTRIES;
2131 }
2132 
2133 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2134 uloc_toUnicodeLocaleKey(const char* keyword)
2135 {
2136     const char* bcpKey = ulocimp_toBcpKey(keyword);
2137     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2138         // unknown keyword, but syntax is fine..
2139         return keyword;
2140     }
2141     return bcpKey;
2142 }
2143 
2144 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2145 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2146 {
2147     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2148     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2149         // unknown keyword, but syntax is fine..
2150         return value;
2151     }
2152     return bcpType;
2153 }
2154 
2155 static UBool
isWellFormedLegacyKey(const char * legacyKey)2156 isWellFormedLegacyKey(const char* legacyKey)
2157 {
2158     const char* p = legacyKey;
2159     while (*p) {
2160         if (!UPRV_ISALPHANUM(*p)) {
2161             return FALSE;
2162         }
2163         p++;
2164     }
2165     return TRUE;
2166 }
2167 
2168 static UBool
isWellFormedLegacyType(const char * legacyType)2169 isWellFormedLegacyType(const char* legacyType)
2170 {
2171     const char* p = legacyType;
2172     int32_t alphaNumLen = 0;
2173     while (*p) {
2174         if (*p == '_' || *p == '/' || *p == '-') {
2175             if (alphaNumLen == 0) {
2176                 return FALSE;
2177             }
2178             alphaNumLen = 0;
2179         } else if (UPRV_ISALPHANUM(*p)) {
2180             alphaNumLen++;
2181         } else {
2182             return FALSE;
2183         }
2184         p++;
2185     }
2186     return (alphaNumLen != 0);
2187 }
2188 
2189 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2190 uloc_toLegacyKey(const char* keyword)
2191 {
2192     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2193     if (legacyKey == NULL) {
2194         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2195         //
2196         // Note:
2197         //  LDML/CLDR provides some definition of keyword syntax in
2198         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2199         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2200         //  Keys can only consist of [0-9a-zA-Z].
2201         if (isWellFormedLegacyKey(keyword)) {
2202             return keyword;
2203         }
2204     }
2205     return legacyKey;
2206 }
2207 
2208 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2209 uloc_toLegacyType(const char* keyword, const char* value)
2210 {
2211     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2212     if (legacyType == NULL) {
2213         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2214         //
2215         // Note:
2216         //  LDML/CLDR provides some definition of keyword syntax in
2217         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2218         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2219         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2220         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2221         if (isWellFormedLegacyType(value)) {
2222             return value;
2223         }
2224     }
2225     return legacyType;
2226 }
2227 
2228 /*eof*/
2229