• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
36 
37 #include "putilimp.h"
38 #include "ustr_imp.h"
39 #include "ulocimp.h"
40 #include "umutex.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43 #include "locmap.h"
44 #include "uarrsort.h"
45 #include "uenumimp.h"
46 #include "uassert.h"
47 
48 #include <stdio.h> /* for sprintf */
49 
50 using namespace icu;
51 
52 /* ### Declarations **************************************************/
53 
54 /* Locale stuff from locid.cpp */
/* NOTE(review): presumably installs `id` as the default locale -- confirm against the definition. */
55 U_CFUNC void locale_set_default(const char *id);
/* NOTE(review): presumably returns the current default locale ID -- confirm against the definition. */
56 U_CFUNC const char *locale_get_default(void);
/*
 * Keyword extraction entry point.  The parameter list matches that of the
 * file-local _getKeywords() further down, minus its addKeyword/addValue
 * arguments.
 */
57 U_CFUNC int32_t
58 locale_getKeywords(const char *localeID,
59             char prev,
60             char *keywords, int32_t keywordCapacity,
61             char *values, int32_t valuesCapacity, int32_t *valLen,
62             UBool valuesToo,
63             UErrorCode *status);
64 
65 /* ### Data tables **************************************************/
66 
67 /**
68  * Table of language codes, both 2- and 3-letter, with preference
69  * given to 2-letter codes where possible.  Includes 3-letter codes
70  * that lack a 2-letter equivalent.
71  *
72  * This list must be in sorted order.  This list is returned directly
73  * to the user by some API.
74  *
75  * This list must be kept in sync with LANGUAGES_3, with corresponding
76  * entries matched.
77  *
78  * This table should be terminated with a NULL entry, followed by a
79  * second list, and another NULL entry.  The first list is visible to
80  * user code when this array is returned by API.  The second list
81  * contains codes we support, but do not expose through user API.
82  *
83  * Notes
84  *
85  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
86  * include the revisions up to 2001/7/27 *CWB*
87  *
88  * The 3 character codes are the terminology codes like RFC 3066.  This
89  * is compatible with prior ICU codes
90  *
91  * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are kept in
92  * the table, but moved to the end, because their 3-character codes are
93  * duplicates.  This avoids bad searches going from 3- to 2-character
94  * codes.
95  *
96  * The range qaa-qtz is reserved for local use
97  */
98 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
99 /* ISO639 table version is 20150505 */
/*
 * Layout: the main list (sorted, one row group per initial letter), a NULL
 * terminator, the obsolete codes, and a second NULL terminator.
 * The rows are laid out to line up one-for-one with LANGUAGES_3; keep both
 * tables in step when adding or removing an entry.
 */
100 static const char * const LANGUAGES[] = {
101     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
102     "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
103     "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
104     "arn", "aro", "arp", "arq", "arw", "ary", "arz", "as",
105     "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
106     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
107     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
108     "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
109     "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
110     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
111     "ca",  "cad", "car", "cay", "cch", "ce",  "ceb", "cgg",
112     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
113     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
114     "cs",  "csb", "cu",  "cv",  "cy",
115     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
116     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
117     "dyo", "dyu", "dz",  "dzg",
118     "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
119     "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
120     "ext",
121     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
122     "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
123     "frs", "fur", "fy",
124     "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
125     "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
126     "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
127     "gur", "guz", "gv",  "gwi",
128     "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
129     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
130     "hup", "hy",  "hz",
131     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
132     "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
133     "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
134     "jv",
135     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
136     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
137     "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
138     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
139     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
140     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
141     "kv",  "kw",  "ky",
142     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
143     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
144     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
145     "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
146     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
147     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
148     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
149     "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
150     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
151     "my",  "mye", "myv", "mzn",
152     "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
153     "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
154     "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
155     "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
156     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
157     "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
158     "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
159     "pon", "prg", "pro", "ps",  "pt",
160     "qu",  "quc", "qug",
161     "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
162     "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
163     "rw",  "rwk",
164     "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
165     "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
166     "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
167     "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
168     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
169     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
170     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
171     "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
172     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
173     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
174     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
175     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
176     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
177     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
178     "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
179     "vot", "vro", "vun",
180     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
181     "xal", "xh",  "xmf", "xog",
182     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
183     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
184     "zun", "zxx", "zza",
/* end of the user-visible list */
185 NULL,
186     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
/* end of the hidden (obsolete) list */
187 NULL
188 };
189 
/*
 * Deprecated 2-letter language codes.  Entries pair up by index with
 * REPLACEMENT_LANGUAGES (in->id, iw->he, ji->yi, jw->jv).  The list ends
 * with two NULL entries, like the two-section code tables above.
 */
190 static const char* const DEPRECATED_LANGUAGES[]={
191     "in", "iw", "ji", "jw", NULL, NULL
192 };
/*
 * Current codes for the entries of DEPRECATED_LANGUAGES, in the same order.
 * Both arrays must keep the same length and ordering.
 */
193 static const char* const REPLACEMENT_LANGUAGES[]={
194     "id", "he", "yi", "jv", NULL, NULL
195 };
196 
197 /**
198  * Table of 3-letter language codes.
199  *
200  * This is a lookup table used to convert 3-letter language codes to
201  * their 2-letter equivalent, where possible.  It must be kept in sync
202  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
203  * same language as LANGUAGES_3[i].  The commented-out lines are
204  * copied from LANGUAGES to make eyeballing this baby easier.
205  *
206  * Where a 3-letter language code has no 2-letter equivalent, the
207  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
208  *
209  * This table should be terminated with a NULL entry, followed by a
210  * second list, and another NULL entry.  The two lists correspond to
211  * the two lists in LANGUAGES.
212  */
213 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
214 /* ISO639 table version is 20150505 */
/*
 * Layout mirrors LANGUAGES row for row: main list, NULL, the 3-letter forms
 * of the obsolete codes, NULL.  Entry i here is the 3-letter code for
 * LANGUAGES[i]; codes without a 2-letter form appear unchanged in both tables.
 */
215 static const char * const LANGUAGES_3[] = {
216     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
217     "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
218     "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
219     "arn", "aro", "arp", "arq", "arw", "ary", "arz", "asm",
220     "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
221     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
222     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
223     "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
224     "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
225     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
226     "cat", "cad", "car", "cay", "cch", "che", "ceb", "cgg",
227     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
228     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
229     "ces", "csb", "chu", "chv", "cym",
230     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
231     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
232     "dyo", "dyu", "dzo", "dzg",
233     "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
234     "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
235     "ext",
236     "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
237     "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
238     "frs", "fur", "fry",
239     "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
240     "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
241     "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
242     "gur", "guz", "glv", "gwi",
243     "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
244     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
245     "hup", "hye", "her",
246     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
247     "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
248     "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
249     "jav",
250     "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
251     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
252     "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
253     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
254     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
255     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
256     "kom", "cor", "kir",
257     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
258     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
259     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
260     "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
261     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
262     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
263     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
264     "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
265     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
266     "mya", "mye", "myv", "mzn",
267     "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
268     "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
269     "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
270     "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
271     "oci", "oji", "orm", "ori", "oss", "osa", "ota",
272     "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
273     "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
274     "pon", "prg", "pro", "pus", "por",
275     "que", "quc", "qug",
276     "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
277     "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
278     "kin", "rwk",
279     "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
280     "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
281     "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
282     "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
283     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
284     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
285     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
286     "swe", "swa", "swb", "swc", "syc", "syr", "szl",
287     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
288     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
289     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
290     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
291     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
292     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
293     "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
294     "vot", "vro", "vun",
295     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
296     "xal", "xho", "xmf", "xog",
297     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
298     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
299     "zun", "zxx", "zza",
/* end of the list matching the user-visible part of LANGUAGES */
300 NULL,
301 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
302     "ind", "heb", "yid", "jaw", "srp",
/* end of the list matching the obsolete part of LANGUAGES */
303 NULL
304 };
305 
306 /**
307  * Table of 2-letter country codes.
308  *
309  * This list must be in sorted order.  This list is returned directly
310  * to the user by some API.
311  *
312  * This list must be kept in sync with COUNTRIES_3, with corresponding
313  * entries matched.
314  *
315  * This table should be terminated with a NULL entry, followed by a
316  * second list, and another NULL entry.  The first list is visible to
317  * user code when this array is returned by API.  The second list
318  * contains codes we support, but do not expose through user API.
319  *
320  * Notes:
321  *
322  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324  * new codes keeping the old ones for compatibility updated to include
325  * 1999/12/03 revisions *CWB*
326  *
327  * RO(ROM) is now RO(ROU) according to
328  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329  */
/*
 * Layout: the main list in sorted order, a NULL terminator, the obsolete
 * codes, and a second NULL terminator.  Entry i corresponds to
 * COUNTRIES_3[i]; the row breaks here are repeated as comment rows in
 * COUNTRIES_3, so keep them in step when editing.
 */
330 static const char * const COUNTRIES[] = {
331     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
332     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
333     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
334     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
335     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
336     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
337     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
338     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
339     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
340     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
341     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
342     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
343     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
344     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
345     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
346     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
347     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
348     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
349     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
350     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
351     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
352     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
353     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
354     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
355     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
356     "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
357     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
358     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
359     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
360     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
/* end of the user-visible list */
361 NULL,
362     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
/* end of the hidden (obsolete) list */
363 NULL
364 };
365 
/*
 * Deprecated 2-letter country codes.  Entry i is paired with
 * REPLACEMENT_COUNTRIES[i]; both arrays end with two NULL entries and must
 * keep the same length and ordering.
 */
366 static const char* const DEPRECATED_COUNTRIES[] = {
367     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
368 };
/*
 * Current codes for the entries of DEPRECATED_COUNTRIES, in the same order.
 * The commented-out row repeats the deprecated codes so the pairs can be
 * checked by eye.
 */
369 static const char* const REPLACEMENT_COUNTRIES[] = {
370 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
371     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
372 };
373 
374 /**
375  * Table of 3-letter country codes.
376  *
377  * This is a lookup table used to convert 3-letter country codes to
378  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
379  * For all valid i, COUNTRIES[i] must refer to the same country as
380  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
381  * to make eyeballing this baby easier.
382  *
383  * This table should be terminated with a NULL entry, followed by a
384  * second list, and another NULL entry.  The two lists correspond to
385  * the two lists in COUNTRIES.
386  */
/*
 * Each data row is preceded by a comment row that repeats the matching
 * 2-letter codes from COUNTRIES, so entry i here is the 3-letter code for
 * COUNTRIES[i].  Layout: main list, NULL, obsolete codes, NULL.
 */
387 static const char * const COUNTRIES_3[] = {
388 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
389     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
390 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
391     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
392 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
393     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
394 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
395     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
396 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
397     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
398 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
399     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
400 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
401     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
402 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
403     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
404 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
405     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
406 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
407     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
408 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
409     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
410 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
411     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
412 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
413     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
414 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
415     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
416 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
417     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
418 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
419     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
420 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
421     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
422 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
423     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
424 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
425     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
426 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
427     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
428 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
429     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
430 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
431     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
432 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
433     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
434 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
435     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
436 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
437     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
438 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
439     "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
440 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
441     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
442 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
443     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
444 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
445     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
446 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
447     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
/* end of the list matching the user-visible part of COUNTRIES */
448 NULL,
449 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
450     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
/* end of the list matching the obsolete part of COUNTRIES */
451 NULL
452 };
453 
/*
 * Row type of CANONICALIZE_MAP: an input locale ID, the ID it canonicalizes
 * to, and an optional keyword/value pair that accompanies the canonical ID.
 * Rows without a keyword carry NULL in both keyword and value.
 */
454 typedef struct CanonicalizationMap {
455     const char *id;          /* input ID */
456     const char *canonicalID; /* canonicalized output ID */
457     const char *keyword;     /* keyword, or NULL if none */
458     const char *value;       /* keyword value, or NULL if kw==NULL */
459 } CanonicalizationMap;
460 
461 /**
462  * A map to canonicalize locale IDs.  This handles a variety of
463  * different semantic kinds of transformations.
464  */
/*
 * Columns: { id, canonicalID, keyword, value }.
 * The rows cover POSIX/.NET/Linux spellings, registered names, and old ICU
 * variants (e.g. *_PREEURO) whose meaning is expressed as a keyword such as
 * currency, collation or calendar.  The array has no sentinel row.
 */
465 static const CanonicalizationMap CANONICALIZE_MAP[] = {
466     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
467     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
468     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
469     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
470     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
471     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
472     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
473     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
474     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
475     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
476     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
477     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
478     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
479     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
480     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
481     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
482     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
483     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
484     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
485     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
486     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
487     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
488     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
489     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
490     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
491     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
492     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
493     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
494     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
495     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
496     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
497     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
498     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
499     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
500     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
501     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
502     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
503     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
504     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
505     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
506     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
507     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
508     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
509     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
510     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
511     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
512 };
513 
/*
 * Row type of VARIANT_MAP: a variant name and the keyword/value pair that
 * stands for it.
 */
514 typedef struct VariantMap {
515     const char *variant;          /* input variant name, e.g. "EURO" */
516     const char *keyword;     /* keyword that replaces the variant */
517     const char *value;       /* value for that keyword */
518 } VariantMap;
519 
/*
 * Columns: { variant, keyword, value }.  Every row supplies a keyword and a
 * value; the array has no sentinel row.
 */
520 static const VariantMap VARIANT_MAP[] = {
521     { "EURO",   "currency", "EUR" },
522     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
523     { "STROKE", "collation", "stroke" }  /* Solaris variant */
524 };
525 
526 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has a BCP47 u extension and does not have '@':
 * true when id is non-NULL, contains no '@', and getShortestSubtagLength()
 * reports a one-character subtag for it.
 *
 * The test is made on the macro argument itself; earlier versions measured
 * whatever variable named localeID happened to be in scope at the call site.
 * The argument is evaluated more than once, so pass an expression without
 * side effects.
 */
#define _hasBCP47Extension(id) ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to a Unicode locale id in buffer and points finalID
 * at the result.  If uloc_forLanguageTag() returns <= 0 or leaves a failure
 * code in *err, finalID is set to the unconverted id instead.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and cannot capture a following "else"; invoke it with a
 * trailing semicolon.  id, buffer and err are evaluated more than once.
 */
#define _ConvertBCP47(finalID, id, buffer, length, err) \
        do { \
            if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || U_FAILURE(*(err))) { \
                (finalID) = (id); \
            } else { \
                (finalID) = (buffer); \
            } \
        } while (0)
536 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)537 static int32_t getShortestSubtagLength(const char *localeID) {
538     int32_t localeIDLength = uprv_strlen(localeID);
539     int32_t length = localeIDLength;
540     int32_t tmpLength = 0;
541     int32_t i;
542     UBool reset = TRUE;
543 
544     for (i = 0; i < localeIDLength; i++) {
545         if (localeID[i] != '_' && localeID[i] != '-') {
546             if (reset) {
547                 tmpLength = 0;
548                 reset = FALSE;
549             }
550             tmpLength++;
551         } else {
552             if (tmpLength != 0 && tmpLength < length) {
553                 length = tmpLength;
554             }
555             reset = TRUE;
556         }
557     }
558 
559     return length;
560 }
561 
562 /* ### Keywords **************************************************/
563 
/*
 * Size in bytes of a canonical keyword-name buffer (KeywordStruct.keyword
 * and the buf argument of locale_canonKeywordName()).  Names of this length
 * or longer are rejected with U_INTERNAL_PROGRAM_ERROR, which leaves room
 * for the terminating zero.
 */
564 #define ULOC_KEYWORD_BUFFER_LEN 25
/*
 * Number of KeywordStruct slots that _getKeywords() keeps on the stack; a
 * locale ID with more keywords than this fails with
 * U_INTERNAL_PROGRAM_ERROR.
 */
565 #define ULOC_MAX_NO_KEYWORDS 25
566 
567 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)568 locale_getKeywordsStart(const char *localeID) {
569     const char *result = NULL;
570     if((result = uprv_strchr(localeID, '@')) != NULL) {
571         return result;
572     }
573 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
574     else {
575         /* We do this because the @ sign is variant, and the @ sign used on one
576         EBCDIC machine won't be compiled the same way on other EBCDIC based
577         machines. */
578         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
579         const uint8_t *charToFind = ebcdicSigns;
580         while(*charToFind) {
581             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
582                 return result;
583             }
584             charToFind++;
585         }
586     }
587 #endif
588     return NULL;
589 }
590 
591 /**
592  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
593  * @param keywordName incoming name to be canonicalized
594  * @param status return status (keyword too long)
595  * @return length of the keyword name
596  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)597 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
598 {
599   int32_t i;
600   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
601 
602   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
603     /* keyword name too long for internal buffer */
604     *status = U_INTERNAL_PROGRAM_ERROR;
605           return 0;
606   }
607 
608   /* normalize the keyword name */
609   for(i = 0; i < keywordNameLen; i++) {
610     buf[i] = uprv_tolower(keywordName[i]);
611   }
612   buf[i] = 0;
613 
614   return keywordNameLen;
615 }
616 
/*
 * One keyword entry collected by _getKeywords().  The keyword name is the
 * key that compareKeywordStructs() orders entries by.
 */
617 typedef struct {
    /* keyword name, stored inline in a fixed-size buffer */
618     char keyword[ULOC_KEYWORD_BUFFER_LEN];
    /* NOTE(review): presumably the length of the name in keyword, without the terminator -- confirm in _getKeywords() */
619     int32_t keywordLen;
    /* NOTE(review): presumably points at the value text inside the caller's locale ID or addValue string (not a copy) -- confirm in _getKeywords() */
620     const char *valueStart;
    /* NOTE(review): presumably the number of characters of the value starting at valueStart -- confirm in _getKeywords() */
621     int32_t valueLen;
622 } KeywordStruct;
623 
624 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)625 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
626     const char* leftString = ((const KeywordStruct *)left)->keyword;
627     const char* rightString = ((const KeywordStruct *)right)->keyword;
628     return uprv_strcmp(leftString, rightString);
629 }
630 
631 /**
632  * Both addKeyword and addValue must already be in canonical form.
633  * Either both addKeyword and addValue are NULL, or neither is NULL.
634  * If they are not NULL they must be zero terminated.
635  * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
636  */
637 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)638 _getKeywords(const char *localeID,
639              char prev,
640              char *keywords, int32_t keywordCapacity,
641              char *values, int32_t valuesCapacity, int32_t *valLen,
642              UBool valuesToo,
643              const char* addKeyword,
644              const char* addValue,
645              UErrorCode *status)
646 {
647     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
648 
649     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
650     int32_t numKeywords = 0;
651     const char* pos = localeID;
652     const char* equalSign = NULL;
653     const char* semicolon = NULL;
654     int32_t i = 0, j, n;
655     int32_t keywordsLen = 0;
656     int32_t valuesLen = 0;
657 
658     if(prev == '@') { /* start of keyword definition */
659         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
660         do {
661             UBool duplicate = FALSE;
662             /* skip leading spaces */
663             while(*pos == ' ') {
664                 pos++;
665             }
666             if (!*pos) { /* handle trailing "; " */
667                 break;
668             }
669             if(numKeywords == maxKeywords) {
670                 *status = U_INTERNAL_PROGRAM_ERROR;
671                 return 0;
672             }
673             equalSign = uprv_strchr(pos, '=');
674             semicolon = uprv_strchr(pos, ';');
675             /* lack of '=' [foo@currency] is illegal */
676             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
677             if(!equalSign || (semicolon && semicolon<equalSign)) {
678                 *status = U_INVALID_FORMAT_ERROR;
679                 return 0;
680             }
681             /* need to normalize both keyword and keyword name */
682             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
683                 /* keyword name too long for internal buffer */
684                 *status = U_INTERNAL_PROGRAM_ERROR;
685                 return 0;
686             }
687             for(i = 0, n = 0; i < equalSign - pos; ++i) {
688                 if (pos[i] != ' ') {
689                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
690                 }
691             }
692 
693             /* zero-length keyword is an error. */
694             if (n == 0) {
695                 *status = U_INVALID_FORMAT_ERROR;
696                 return 0;
697             }
698 
699             keywordList[numKeywords].keyword[n] = 0;
700             keywordList[numKeywords].keywordLen = n;
701             /* now grab the value part. First we skip the '=' */
702             equalSign++;
703             /* then we leading spaces */
704             while(*equalSign == ' ') {
705                 equalSign++;
706             }
707 
708             /* Premature end or zero-length value */
709             if (!*equalSign || equalSign == semicolon) {
710                 *status = U_INVALID_FORMAT_ERROR;
711                 return 0;
712             }
713 
714             keywordList[numKeywords].valueStart = equalSign;
715 
716             pos = semicolon;
717             i = 0;
718             if(pos) {
719                 while(*(pos - i - 1) == ' ') {
720                     i++;
721                 }
722                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
723                 pos++;
724             } else {
725                 i = (int32_t)uprv_strlen(equalSign);
726                 while(i && equalSign[i-1] == ' ') {
727                     i--;
728                 }
729                 keywordList[numKeywords].valueLen = i;
730             }
731             /* If this is a duplicate keyword, then ignore it */
732             for (j=0; j<numKeywords; ++j) {
733                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
734                     duplicate = TRUE;
735                     break;
736                 }
737             }
738             if (!duplicate) {
739                 ++numKeywords;
740             }
741         } while(pos);
742 
743         /* Handle addKeyword/addValue. */
744         if (addKeyword != NULL) {
745             UBool duplicate = FALSE;
746             U_ASSERT(addValue != NULL);
747             /* Search for duplicate; if found, do nothing. Explicit keyword
748                overrides addKeyword. */
749             for (j=0; j<numKeywords; ++j) {
750                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
751                     duplicate = TRUE;
752                     break;
753                 }
754             }
755             if (!duplicate) {
756                 if (numKeywords == maxKeywords) {
757                     *status = U_INTERNAL_PROGRAM_ERROR;
758                     return 0;
759                 }
760                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
761                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
762                 keywordList[numKeywords].valueStart = addValue;
763                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
764                 ++numKeywords;
765             }
766         } else {
767             U_ASSERT(addValue == NULL);
768         }
769 
770         /* now we have a list of keywords */
771         /* we need to sort it */
772         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
773 
774         /* Now construct the keyword part */
775         for(i = 0; i < numKeywords; i++) {
776             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
777                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
778                 if(valuesToo) {
779                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
780                 } else {
781                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
782                 }
783             }
784             keywordsLen += keywordList[i].keywordLen + 1;
785             if(valuesToo) {
786                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
787                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
788                 }
789                 keywordsLen += keywordList[i].valueLen;
790 
791                 if(i < numKeywords - 1) {
792                     if(keywordsLen < keywordCapacity) {
793                         keywords[keywordsLen] = ';';
794                     }
795                     keywordsLen++;
796                 }
797             }
798             if(values) {
799                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
800                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
801                     values[valuesLen + keywordList[i].valueLen] = 0;
802                 }
803                 valuesLen += keywordList[i].valueLen + 1;
804             }
805         }
806         if(values) {
807             values[valuesLen] = 0;
808             if(valLen) {
809                 *valLen = valuesLen;
810             }
811         }
812         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
813     } else {
814         return 0;
815     }
816 }
817 
818 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)819 locale_getKeywords(const char *localeID,
820                    char prev,
821                    char *keywords, int32_t keywordCapacity,
822                    char *values, int32_t valuesCapacity, int32_t *valLen,
823                    UBool valuesToo,
824                    UErrorCode *status) {
825     return _getKeywords(localeID, prev, keywords, keywordCapacity,
826                         values, valuesCapacity, valLen, valuesToo,
827                         NULL, NULL, status);
828 }
829 
830 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)831 uloc_getKeywordValue(const char* localeID,
832                      const char* keywordName,
833                      char* buffer, int32_t bufferCapacity,
834                      UErrorCode* status)
835 {
836     const char* startSearchHere = NULL;
837     const char* nextSeparator = NULL;
838     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
839     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
840     int32_t i = 0;
841     int32_t result = 0;
842 
843     if(status && U_SUCCESS(*status) && localeID) {
844       char tempBuffer[ULOC_FULLNAME_CAPACITY];
845       const char* tmpLocaleID;
846 
847       if (_hasBCP47Extension(localeID)) {
848           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
849       } else {
850           tmpLocaleID=localeID;
851       }
852 
853       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
854       if(startSearchHere == NULL) {
855           /* no keywords, return at once */
856           return 0;
857       }
858 
859       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
860       if(U_FAILURE(*status)) {
861         return 0;
862       }
863 
864       /* find the first keyword */
865       while(startSearchHere) {
866           startSearchHere++;
867           /* skip leading spaces (allowed?) */
868           while(*startSearchHere == ' ') {
869               startSearchHere++;
870           }
871           nextSeparator = uprv_strchr(startSearchHere, '=');
872           /* need to normalize both keyword and keyword name */
873           if(!nextSeparator) {
874               break;
875           }
876           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
877               /* keyword name too long for internal buffer */
878               *status = U_INTERNAL_PROGRAM_ERROR;
879               return 0;
880           }
881           for(i = 0; i < nextSeparator - startSearchHere; i++) {
882               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
883           }
884           /* trim trailing spaces */
885           while(startSearchHere[i-1] == ' ') {
886               i--;
887               U_ASSERT(i>=0);
888           }
889           localeKeywordNameBuffer[i] = 0;
890 
891           startSearchHere = uprv_strchr(nextSeparator, ';');
892 
893           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
894               nextSeparator++;
895               while(*nextSeparator == ' ') {
896                   nextSeparator++;
897               }
898               /* we actually found the keyword. Copy the value */
899               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
900                   while(*(startSearchHere-1) == ' ') {
901                       startSearchHere--;
902                   }
903                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
904                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
905               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
906                   i = (int32_t)uprv_strlen(nextSeparator);
907                   while(nextSeparator[i - 1] == ' ') {
908                       i--;
909                   }
910                   uprv_strncpy(buffer, nextSeparator, i);
911                   result = u_terminateChars(buffer, bufferCapacity, i, status);
912               } else {
913                   /* give a bigger buffer, please */
914                   *status = U_BUFFER_OVERFLOW_ERROR;
915                   if(startSearchHere) {
916                       result = (int32_t)(startSearchHere - nextSeparator);
917                   } else {
918                       result = (int32_t)uprv_strlen(nextSeparator);
919                   }
920               }
921               return result;
922           }
923       }
924     }
925     return 0;
926 }
927 
928 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)929 uloc_setKeywordValue(const char* keywordName,
930                      const char* keywordValue,
931                      char* buffer, int32_t bufferCapacity,
932                      UErrorCode* status)
933 {
934     /* TODO: sorting. removal. */
935     int32_t keywordNameLen;
936     int32_t keywordValueLen;
937     int32_t bufLen;
938     int32_t needLen = 0;
939     int32_t foundValueLen;
940     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
941     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
942     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
943     int32_t i = 0;
944     int32_t rc;
945     char* nextSeparator = NULL;
946     char* nextEqualsign = NULL;
947     char* startSearchHere = NULL;
948     char* keywordStart = NULL;
949     char *insertHere = NULL;
950     if(U_FAILURE(*status)) {
951         return -1;
952     }
953     if(bufferCapacity>1) {
954         bufLen = (int32_t)uprv_strlen(buffer);
955     } else {
956         *status = U_ILLEGAL_ARGUMENT_ERROR;
957         return 0;
958     }
959     if(bufferCapacity<bufLen) {
960         /* The capacity is less than the length?! Is this NULL terminated? */
961         *status = U_ILLEGAL_ARGUMENT_ERROR;
962         return 0;
963     }
964     if(keywordValue && !*keywordValue) {
965         keywordValue = NULL;
966     }
967     if(keywordValue) {
968         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
969     } else {
970         keywordValueLen = 0;
971     }
972     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
973     if(U_FAILURE(*status)) {
974         return 0;
975     }
976     startSearchHere = (char*)locale_getKeywordsStart(buffer);
977     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
978         if(!keywordValue) { /* no keywords = nothing to remove */
979             return bufLen;
980         }
981 
982         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
983         if(startSearchHere) { /* had a single @ */
984             needLen--; /* already had the @ */
985             /* startSearchHere points at the @ */
986         } else {
987             startSearchHere=buffer+bufLen;
988         }
989         if(needLen >= bufferCapacity) {
990             *status = U_BUFFER_OVERFLOW_ERROR;
991             return needLen; /* no change */
992         }
993         *startSearchHere = '@';
994         startSearchHere++;
995         uprv_strcpy(startSearchHere, keywordNameBuffer);
996         startSearchHere += keywordNameLen;
997         *startSearchHere = '=';
998         startSearchHere++;
999         uprv_strcpy(startSearchHere, keywordValue);
1000         startSearchHere+=keywordValueLen;
1001         return needLen;
1002     } /* end shortcut - no @ */
1003 
1004     keywordStart = startSearchHere;
1005     /* search for keyword */
1006     while(keywordStart) {
1007         keywordStart++;
1008         /* skip leading spaces (allowed?) */
1009         while(*keywordStart == ' ') {
1010             keywordStart++;
1011         }
1012         nextEqualsign = uprv_strchr(keywordStart, '=');
1013         /* need to normalize both keyword and keyword name */
1014         if(!nextEqualsign) {
1015             break;
1016         }
1017         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1018             /* keyword name too long for internal buffer */
1019             *status = U_INTERNAL_PROGRAM_ERROR;
1020             return 0;
1021         }
1022         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1023             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1024         }
1025         /* trim trailing spaces */
1026         while(keywordStart[i-1] == ' ') {
1027             i--;
1028         }
1029         U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1030         localeKeywordNameBuffer[i] = 0;
1031 
1032         nextSeparator = uprv_strchr(nextEqualsign, ';');
1033         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1034         if(rc == 0) {
1035             nextEqualsign++;
1036             while(*nextEqualsign == ' ') {
1037                 nextEqualsign++;
1038             }
1039             /* we actually found the keyword. Change the value */
1040             if (nextSeparator) {
1041                 keywordAtEnd = 0;
1042                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1043             } else {
1044                 keywordAtEnd = 1;
1045                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1046             }
1047             if(keywordValue) { /* adding a value - not removing */
1048               if(foundValueLen == keywordValueLen) {
1049                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1050                 return bufLen; /* no change in size */
1051               } else if(foundValueLen > keywordValueLen) {
1052                 int32_t delta = foundValueLen - keywordValueLen;
1053                 if(nextSeparator) { /* RH side */
1054                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1055                 }
1056                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1057                 bufLen -= delta;
1058                 buffer[bufLen]=0;
1059                 return bufLen;
1060               } else { /* FVL < KVL */
1061                 int32_t delta = keywordValueLen - foundValueLen;
1062                 if((bufLen+delta) >= bufferCapacity) {
1063                   *status = U_BUFFER_OVERFLOW_ERROR;
1064                   return bufLen+delta;
1065                 }
1066                 if(nextSeparator) { /* RH side */
1067                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1068                 }
1069                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1070                 bufLen += delta;
1071                 buffer[bufLen]=0;
1072                 return bufLen;
1073               }
1074             } else { /* removing a keyword */
1075               if(keywordAtEnd) {
1076                 /* zero out the ';' or '@' just before startSearchhere */
1077                 keywordStart[-1] = 0;
1078                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1079               } else {
1080                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1081                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1082                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1083               }
1084             }
1085         } else if(rc<0){ /* end match keyword */
1086           /* could insert at this location. */
1087           insertHere = keywordStart;
1088         }
1089         keywordStart = nextSeparator;
1090     } /* end loop searching */
1091 
1092     if(!keywordValue) {
1093       return bufLen; /* removal of non-extant keyword - no change */
1094     }
1095 
1096     /* we know there is at least one keyword. */
1097     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1098     if(needLen >= bufferCapacity) {
1099         *status = U_BUFFER_OVERFLOW_ERROR;
1100         return needLen; /* no change */
1101     }
1102 
1103     if(insertHere) {
1104       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1105       keywordStart = insertHere;
1106     } else {
1107       keywordStart = buffer+bufLen;
1108       *keywordStart = ';';
1109       keywordStart++;
1110     }
1111     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1112     keywordStart += keywordNameLen;
1113     *keywordStart = '=';
1114     keywordStart++;
1115     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1116     keywordStart+=keywordValueLen;
1117     if(insertHere) {
1118       *keywordStart = ';';
1119       keywordStart++;
1120     }
1121     buffer[needLen]=0;
1122     return needLen;
1123 }
1124 
/* ### ID parsing implementation **************************************************/
1126 
/* NOTE: these macros evaluate their argument more than once;
   do not pass expressions with side effects. */

/* TRUE if a is one of the letters that can start a special prefix: x, X, i, I */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1137 
_strnchr(const char * str,int32_t len,char c)1138 static char* _strnchr(const char* str, int32_t len, char c) {
1139     U_ASSERT(str != 0 && len >= 0);
1140     while (len-- != 0) {
1141         char d = *str;
1142         if (d == c) {
1143             return (char*) str;
1144         } else if (d == 0) {
1145             break;
1146         }
1147         ++str;
1148     }
1149     return NULL;
1150 }
1151 
1152 /**
1153  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1154  * a NULL entry, followed by more entries, and a second NULL entry.
1155  *
1156  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1157  * COUNTRIES_3.
1158  */
_findIndex(const char * const * list,const char * key)1159 static int16_t _findIndex(const char* const* list, const char* key)
1160 {
1161     const char* const* anchor = list;
1162     int32_t pass = 0;
1163 
1164     /* Make two passes through two NULL-terminated arrays at 'list' */
1165     while (pass++ < 2) {
1166         while (*list) {
1167             if (uprv_strcmp(key, *list) == 0) {
1168                 return (int16_t)(list - anchor);
1169             }
1170             list++;
1171         }
1172         ++list;     /* skip final NULL *CWB*/
1173     }
1174     return -1;
1175 }
1176 
1177 /* count the length of src while copying it to dest; return strlen(src) */
1178 static inline int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1179 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1180     const char *anchor;
1181     char c;
1182 
1183     anchor=src;
1184     for(;;) {
1185         if((c=*src)==0) {
1186             return (int32_t)(src-anchor);
1187         }
1188         if(destCapacity<=0) {
1189             return (int32_t)((src-anchor)+uprv_strlen(src));
1190         }
1191         ++src;
1192         *dest++=c;
1193         --destCapacity;
1194     }
1195 }
1196 
1197 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1198 uloc_getCurrentCountryID(const char* oldID){
1199     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1200     if (offset >= 0) {
1201         return REPLACEMENT_COUNTRIES[offset];
1202     }
1203     return oldID;
1204 }
1205 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1206 uloc_getCurrentLanguageID(const char* oldID){
1207     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1208     if (offset >= 0) {
1209         return REPLACEMENT_LANGUAGES[offset];
1210     }
1211     return oldID;
1212 }
1213 /*
1214  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1215  * avoid duplicating code to handle the earlier locale ID pieces
1216  * in the functions for the later ones by
1217  * setting the *pEnd pointer to where they stopped parsing
1218  *
1219  * TODO try to use this in Locale
1220  */
1221 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1222 ulocimp_getLanguage(const char *localeID,
1223                     char *language, int32_t languageCapacity,
1224                     const char **pEnd) {
1225     int32_t i=0;
1226     int32_t offset;
1227     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1228 
1229     /* if it starts with i- or x- then copy that prefix */
1230     if(_isIDPrefix(localeID)) {
1231         if(i<languageCapacity) {
1232             language[i]=(char)uprv_tolower(*localeID);
1233         }
1234         if(i<languageCapacity) {
1235             language[i+1]='-';
1236         }
1237         i+=2;
1238         localeID+=2;
1239     }
1240 
1241     /* copy the language as far as possible and count its length */
1242     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1243         if(i<languageCapacity) {
1244             language[i]=(char)uprv_tolower(*localeID);
1245         }
1246         if(i<3) {
1247             U_ASSERT(i>=0);
1248             lang[i]=(char)uprv_tolower(*localeID);
1249         }
1250         i++;
1251         localeID++;
1252     }
1253 
1254     if(i==3) {
1255         /* convert 3 character code to 2 character code if possible *CWB*/
1256         offset=_findIndex(LANGUAGES_3, lang);
1257         if(offset>=0) {
1258             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1259         }
1260     }
1261 
1262     if(pEnd!=NULL) {
1263         *pEnd=localeID;
1264     }
1265     return i;
1266 }
1267 
1268 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1269 ulocimp_getScript(const char *localeID,
1270                   char *script, int32_t scriptCapacity,
1271                   const char **pEnd)
1272 {
1273     int32_t idLen = 0;
1274 
1275     if (pEnd != NULL) {
1276         *pEnd = localeID;
1277     }
1278 
1279     /* copy the second item as far as possible and count its length */
1280     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1281             && uprv_isASCIILetter(localeID[idLen])) {
1282         idLen++;
1283     }
1284 
1285     /* If it's exactly 4 characters long, then it's a script and not a country. */
1286     if (idLen == 4) {
1287         int32_t i;
1288         if (pEnd != NULL) {
1289             *pEnd = localeID+idLen;
1290         }
1291         if(idLen > scriptCapacity) {
1292             idLen = scriptCapacity;
1293         }
1294         if (idLen >= 1) {
1295             script[0]=(char)uprv_toupper(*(localeID++));
1296         }
1297         for (i = 1; i < idLen; i++) {
1298             script[i]=(char)uprv_tolower(*(localeID++));
1299         }
1300     }
1301     else {
1302         idLen = 0;
1303     }
1304     return idLen;
1305 }
1306 
1307 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1308 ulocimp_getCountry(const char *localeID,
1309                    char *country, int32_t countryCapacity,
1310                    const char **pEnd)
1311 {
1312     int32_t idLen=0;
1313     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1314     int32_t offset;
1315 
1316     /* copy the country as far as possible and count its length */
1317     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1318         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1319             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1320         }
1321         idLen++;
1322     }
1323 
1324     /* the country should be either length 2 or 3 */
1325     if (idLen == 2 || idLen == 3) {
1326         UBool gotCountry = FALSE;
1327         /* convert 3 character code to 2 character code if possible *CWB*/
1328         if(idLen==3) {
1329             offset=_findIndex(COUNTRIES_3, cnty);
1330             if(offset>=0) {
1331                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1332                 gotCountry = TRUE;
1333             }
1334         }
1335         if (!gotCountry) {
1336             int32_t i = 0;
1337             for (i = 0; i < idLen; i++) {
1338                 if (i < countryCapacity) {
1339                     country[i]=(char)uprv_toupper(localeID[i]);
1340                 }
1341             }
1342         }
1343         localeID+=idLen;
1344     } else {
1345         idLen = 0;
1346     }
1347 
1348     if(pEnd!=NULL) {
1349         *pEnd=localeID;
1350     }
1351 
1352     return idLen;
1353 }
1354 
1355 /**
1356  * @param needSeparator if true, then add leading '_' if any variants
1357  * are added to 'variant'
1358  */
1359 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1360 _getVariantEx(const char *localeID,
1361               char prev,
1362               char *variant, int32_t variantCapacity,
1363               UBool needSeparator) {
1364     int32_t i=0;
1365 
1366     /* get one or more variant tags and separate them with '_' */
1367     if(_isIDSeparator(prev)) {
1368         /* get a variant string after a '-' or '_' */
1369         while(!_isTerminator(*localeID)) {
1370             if (needSeparator) {
1371                 if (i<variantCapacity) {
1372                     variant[i] = '_';
1373                 }
1374                 ++i;
1375                 needSeparator = FALSE;
1376             }
1377             if(i<variantCapacity) {
1378                 variant[i]=(char)uprv_toupper(*localeID);
1379                 if(variant[i]=='-') {
1380                     variant[i]='_';
1381                 }
1382             }
1383             i++;
1384             localeID++;
1385         }
1386     }
1387 
1388     /* if there is no variant tag after a '-' or '_' then look for '@' */
1389     if(i==0) {
1390         if(prev=='@') {
1391             /* keep localeID */
1392         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1393             ++localeID; /* point after the '@' */
1394         } else {
1395             return 0;
1396         }
1397         while(!_isTerminator(*localeID)) {
1398             if (needSeparator) {
1399                 if (i<variantCapacity) {
1400                     variant[i] = '_';
1401                 }
1402                 ++i;
1403                 needSeparator = FALSE;
1404             }
1405             if(i<variantCapacity) {
1406                 variant[i]=(char)uprv_toupper(*localeID);
1407                 if(variant[i]=='-' || variant[i]==',') {
1408                     variant[i]='_';
1409                 }
1410             }
1411             i++;
1412             localeID++;
1413         }
1414     }
1415 
1416     return i;
1417 }
1418 
1419 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1420 _getVariant(const char *localeID,
1421             char prev,
1422             char *variant, int32_t variantCapacity) {
1423     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1424 }
1425 
1426 /**
1427  * Delete ALL instances of a variant from the given list of one or
1428  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1429  * @param variants the source string of one or more variants,
1430  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1431  * terminated; if it is, trailing zero will NOT be maintained.
1432  * @param variantsLen length of variants
1433  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1434  * or "PREEURO"; not zero terminated
1435  * @param toDeleteLen length of toDelete
1436  * @return number of characters deleted from variants
1437  */
1438 static int32_t
_deleteVariant(char * variants,int32_t variantsLen,const char * toDelete,int32_t toDeleteLen)1439 _deleteVariant(char* variants, int32_t variantsLen,
1440                const char* toDelete, int32_t toDeleteLen)
1441 {
1442     int32_t delta = 0; /* number of chars deleted */
1443     for (;;) {
1444         UBool flag = FALSE;
1445         if (variantsLen < toDeleteLen) {
1446             return delta;
1447         }
1448         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1449             (variantsLen == toDeleteLen ||
1450              (flag=(variants[toDeleteLen] == '_'))))
1451         {
1452             int32_t d = toDeleteLen + (flag?1:0);
1453             variantsLen -= d;
1454             delta += d;
1455             if (variantsLen > 0) {
1456                 uprv_memmove(variants, variants+d, variantsLen);
1457             }
1458         } else {
1459             char* p = _strnchr(variants, variantsLen, '_');
1460             if (p == NULL) {
1461                 return delta;
1462             }
1463             ++p;
1464             variantsLen -= (int32_t)(p - variants);
1465             variants = p;
1466         }
1467     }
1468 }
1469 
1470 /* Keyword enumeration */
1471 
/*
 * Iteration state used by the uloc_kw_* enumeration callbacks below.
 */
typedef struct UKeywordsContext {
    char* keywords; /* start of the keyword list: consecutive NUL-terminated
                       entries, ended by an empty entry; released with
                       uprv_free() in uloc_kw_closeKeywords() */
    char* current;  /* entry that uloc_kw_nextKeyword() will return next */
} UKeywordsContext;
1476 
1477 U_CDECL_BEGIN
1478 
1479 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1480 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1481     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1482     uprv_free(enumerator->context);
1483     uprv_free(enumerator);
1484 }
1485 
1486 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1487 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1488     char *kw = ((UKeywordsContext *)en->context)->keywords;
1489     int32_t result = 0;
1490     while(*kw) {
1491         result++;
1492         kw += uprv_strlen(kw)+1;
1493     }
1494     return result;
1495 }
1496 
1497 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1498 uloc_kw_nextKeyword(UEnumeration* en,
1499                     int32_t* resultLength,
1500                     UErrorCode* /*status*/) {
1501     const char* result = ((UKeywordsContext *)en->context)->current;
1502     int32_t len = 0;
1503     if(*result) {
1504         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1505         ((UKeywordsContext *)en->context)->current += len+1;
1506     } else {
1507         result = NULL;
1508     }
1509     if (resultLength) {
1510         *resultLength = len;
1511     }
1512     return result;
1513 }
1514 
1515 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1516 uloc_kw_resetKeywords(UEnumeration* en,
1517                       UErrorCode* /*status*/) {
1518     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1519 }
1520 
1521 U_CDECL_END
1522 
1523 
/*
 * Template for keyword enumerations.  uloc_openKeywordList() copies this
 * structure into a newly allocated UEnumeration and then fills in the
 * context member with a UKeywordsContext.
 * NOTE(review): the initializer is positional, so its order has to match
 * the member order of UEnumeration as declared in uenumimp.h -- confirm
 * there before reordering.
 */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1533 
1534 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1535 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1536 {
1537     UKeywordsContext *myContext = NULL;
1538     UEnumeration *result = NULL;
1539 
1540     if(U_FAILURE(*status)) {
1541         return NULL;
1542     }
1543     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1544     /* Null pointer test */
1545     if (result == NULL) {
1546         *status = U_MEMORY_ALLOCATION_ERROR;
1547         return NULL;
1548     }
1549     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1550     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1551     if (myContext == NULL) {
1552         *status = U_MEMORY_ALLOCATION_ERROR;
1553         uprv_free(result);
1554         return NULL;
1555     }
1556     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1557     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1558     myContext->keywords[keywordListSize] = 0;
1559     myContext->current = myContext->keywords;
1560     result->context = myContext;
1561     return result;
1562 }
1563 
1564 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1565 uloc_openKeywords(const char* localeID,
1566                         UErrorCode* status)
1567 {
1568     int32_t i=0;
1569     char keywords[256];
1570     int32_t keywordsCapacity = 256;
1571     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1572     const char* tmpLocaleID;
1573 
1574     if(status==NULL || U_FAILURE(*status)) {
1575         return 0;
1576     }
1577 
1578     if (_hasBCP47Extension(localeID)) {
1579         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1580     } else {
1581         if (localeID==NULL) {
1582            localeID=uloc_getDefault();
1583         }
1584         tmpLocaleID=localeID;
1585     }
1586 
1587     /* Skip the language */
1588     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1589     if(_isIDSeparator(*tmpLocaleID)) {
1590         const char *scriptID;
1591         /* Skip the script if available */
1592         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1593         if(scriptID != tmpLocaleID+1) {
1594             /* Found optional script */
1595             tmpLocaleID = scriptID;
1596         }
1597         /* Skip the Country */
1598         if (_isIDSeparator(*tmpLocaleID)) {
1599             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1600             if(_isIDSeparator(*tmpLocaleID)) {
1601                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1602             }
1603         }
1604     }
1605 
1606     /* keywords are located after '@' */
1607     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1608         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1609     }
1610 
1611     if(i) {
1612         return uloc_openKeywordList(keywords, i, status);
1613     } else {
1614         return NULL;
1615     }
1616 }
1617 
1618 
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Nonzero when at least one bit of 'mask' is set in 'options'.
   Both arguments are parenthesized so that compound expressions such as
   OPTION_SET(a | b, mask) are evaluated as a whole. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The "i-default" tag, stored without a terminating zero. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1627 
1628 /**
1629  * Canonicalize the given localeID, to level 1 or to level 2,
1630  * depending on the options.  To specify level 1, pass in options=0.
1631  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1632  *
1633  * This is the code underlying uloc_getName and uloc_canonicalize.
1634  */
1635 static int32_t
_canonicalize(const char * localeID,char * result,int32_t resultCapacity,uint32_t options,UErrorCode * err)1636 _canonicalize(const char* localeID,
1637               char* result,
1638               int32_t resultCapacity,
1639               uint32_t options,
1640               UErrorCode* err) {
1641     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1642     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1643     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1644     const char* origLocaleID;
1645     const char* tmpLocaleID;
1646     const char* keywordAssign = NULL;
1647     const char* separatorIndicator = NULL;
1648     const char* addKeyword = NULL;
1649     const char* addValue = NULL;
1650     char* name;
1651     char* variant = NULL; /* pointer into name, or NULL */
1652 
1653     if (U_FAILURE(*err)) {
1654         return 0;
1655     }
1656 
1657     if (_hasBCP47Extension(localeID)) {
1658         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1659     } else {
1660         if (localeID==NULL) {
1661            localeID=uloc_getDefault();
1662         }
1663         tmpLocaleID=localeID;
1664     }
1665 
1666     origLocaleID=tmpLocaleID;
1667 
1668     /* if we are doing a full canonicalization, then put results in
1669        localeBuffer, if necessary; otherwise send them to result. */
1670     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1671         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1672         name = localeBuffer;
1673         nameCapacity = (int32_t)sizeof(localeBuffer);
1674     } else {
1675         name = result;
1676         nameCapacity = resultCapacity;
1677     }
1678 
1679     /* get all pieces, one after another, and separate with '_' */
1680     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1681 
1682     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1683         const char *d = uloc_getDefault();
1684 
1685         len = (int32_t)uprv_strlen(d);
1686 
1687         if (name != NULL) {
1688             uprv_strncpy(name, d, len);
1689         }
1690     } else if(_isIDSeparator(*tmpLocaleID)) {
1691         const char *scriptID;
1692 
1693         ++fieldCount;
1694         if(len<nameCapacity) {
1695             name[len]='_';
1696         }
1697         ++len;
1698 
1699         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1700             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1701         if(scriptSize > 0) {
1702             /* Found optional script */
1703             tmpLocaleID = scriptID;
1704             ++fieldCount;
1705             len+=scriptSize;
1706             if (_isIDSeparator(*tmpLocaleID)) {
1707                 /* If there is something else, then we add the _ */
1708                 if(len<nameCapacity) {
1709                     name[len]='_';
1710                 }
1711                 ++len;
1712             }
1713         }
1714 
1715         if (_isIDSeparator(*tmpLocaleID)) {
1716             const char *cntryID;
1717             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1718                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1719             if (cntrySize > 0) {
1720                 /* Found optional country */
1721                 tmpLocaleID = cntryID;
1722                 len+=cntrySize;
1723             }
1724             if(_isIDSeparator(*tmpLocaleID)) {
1725                 /* If there is something else, then we add the _  if we found country before. */
1726                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1727                     ++fieldCount;
1728                     if(len<nameCapacity) {
1729                         name[len]='_';
1730                     }
1731                     ++len;
1732                 }
1733 
1734                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1735                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1736                 if (variantSize > 0) {
1737                     variant = len<nameCapacity ? name+len : NULL;
1738                     len += variantSize;
1739                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1740                 }
1741             }
1742         }
1743     }
1744 
1745     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1746     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1747         UBool done = FALSE;
1748         do {
1749             char c = *tmpLocaleID;
1750             switch (c) {
1751             case 0:
1752             case '@':
1753                 done = TRUE;
1754                 break;
1755             default:
1756                 if (len<nameCapacity) {
1757                     name[len] = c;
1758                 }
1759                 ++len;
1760                 ++tmpLocaleID;
1761                 break;
1762             }
1763         } while (!done);
1764     }
1765 
1766     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1767        After this, tmpLocaleID either points to '@' or is NULL */
1768     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1769         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1770         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1771     }
1772 
1773     /* Copy POSIX-style variant, if any [mr@FOO] */
1774     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1775         tmpLocaleID != NULL && keywordAssign == NULL) {
1776         for (;;) {
1777             char c = *tmpLocaleID;
1778             if (c == 0) {
1779                 break;
1780             }
1781             if (len<nameCapacity) {
1782                 name[len] = c;
1783             }
1784             ++len;
1785             ++tmpLocaleID;
1786         }
1787     }
1788 
1789     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1790         /* Handle @FOO variant if @ is present and not followed by = */
1791         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1792             int32_t posixVariantSize;
1793             /* Add missing '_' if needed */
1794             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1795                 do {
1796                     if(len<nameCapacity) {
1797                         name[len]='_';
1798                     }
1799                     ++len;
1800                     ++fieldCount;
1801                 } while(fieldCount<2);
1802             }
1803             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1804                                              (UBool)(variantSize > 0));
1805             if (posixVariantSize > 0) {
1806                 if (variant == NULL) {
1807                     variant = name+len;
1808                 }
1809                 len += posixVariantSize;
1810                 variantSize += posixVariantSize;
1811             }
1812         }
1813 
1814         /* Handle generic variants first */
1815         if (variant) {
1816             for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
1817                 const char* variantToCompare = VARIANT_MAP[j].variant;
1818                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1819                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1820                 len -= variantLen;
1821                 if (variantLen > 0) {
1822                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1823                         --len;
1824                     }
1825                     addKeyword = VARIANT_MAP[j].keyword;
1826                     addValue = VARIANT_MAP[j].value;
1827                     break;
1828                 }
1829             }
1830             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1831                 --len;
1832             }
1833         }
1834 
1835         /* Look up the ID in the canonicalization map */
1836         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1837             const char* id = CANONICALIZE_MAP[j].id;
1838             int32_t n = (int32_t)uprv_strlen(id);
1839             if (len == n && uprv_strncmp(name, id, n) == 0) {
1840                 if (n == 0 && tmpLocaleID != NULL) {
1841                     break; /* Don't remap "" if keywords present */
1842                 }
1843                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1844                 if (CANONICALIZE_MAP[j].keyword) {
1845                     addKeyword = CANONICALIZE_MAP[j].keyword;
1846                     addValue = CANONICALIZE_MAP[j].value;
1847                 }
1848                 break;
1849             }
1850         }
1851     }
1852 
1853     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1854         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1855             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1856             if(len<nameCapacity) {
1857                 name[len]='@';
1858             }
1859             ++len;
1860             ++fieldCount;
1861             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1862                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1863         } else if (addKeyword != NULL) {
1864             U_ASSERT(addValue != NULL && len < nameCapacity);
1865             /* inelegant but works -- later make _getKeywords do this? */
1866             len += _copyCount(name+len, nameCapacity-len, "@");
1867             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1868             len += _copyCount(name+len, nameCapacity-len, "=");
1869             len += _copyCount(name+len, nameCapacity-len, addValue);
1870         }
1871     }
1872 
1873     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1874         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1875     }
1876 
1877     return u_terminateChars(result, resultCapacity, len, err);
1878 }
1879 
1880 /* ### ID parsing API **************************************************/
1881 
1882 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1883 uloc_getParent(const char*    localeID,
1884                char* parent,
1885                int32_t parentCapacity,
1886                UErrorCode* err)
1887 {
1888     const char *lastUnderscore;
1889     int32_t i;
1890 
1891     if (U_FAILURE(*err))
1892         return 0;
1893 
1894     if (localeID == NULL)
1895         localeID = uloc_getDefault();
1896 
1897     lastUnderscore=uprv_strrchr(localeID, '_');
1898     if(lastUnderscore!=NULL) {
1899         i=(int32_t)(lastUnderscore-localeID);
1900     } else {
1901         i=0;
1902     }
1903 
1904     if(i>0 && parent != localeID) {
1905         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1906     }
1907     return u_terminateChars(parent, parentCapacity, i, err);
1908 }
1909 
1910 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1911 uloc_getLanguage(const char*    localeID,
1912          char* language,
1913          int32_t languageCapacity,
1914          UErrorCode* err)
1915 {
1916     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1917     int32_t i=0;
1918 
1919     if (err==NULL || U_FAILURE(*err)) {
1920         return 0;
1921     }
1922 
1923     if(localeID==NULL) {
1924         localeID=uloc_getDefault();
1925     }
1926 
1927     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1928     return u_terminateChars(language, languageCapacity, i, err);
1929 }
1930 
1931 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1932 uloc_getScript(const char*    localeID,
1933          char* script,
1934          int32_t scriptCapacity,
1935          UErrorCode* err)
1936 {
1937     int32_t i=0;
1938 
1939     if(err==NULL || U_FAILURE(*err)) {
1940         return 0;
1941     }
1942 
1943     if(localeID==NULL) {
1944         localeID=uloc_getDefault();
1945     }
1946 
1947     /* skip the language */
1948     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1949     if(_isIDSeparator(*localeID)) {
1950         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1951     }
1952     return u_terminateChars(script, scriptCapacity, i, err);
1953 }
1954 
1955 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1956 uloc_getCountry(const char* localeID,
1957             char* country,
1958             int32_t countryCapacity,
1959             UErrorCode* err)
1960 {
1961     int32_t i=0;
1962 
1963     if(err==NULL || U_FAILURE(*err)) {
1964         return 0;
1965     }
1966 
1967     if(localeID==NULL) {
1968         localeID=uloc_getDefault();
1969     }
1970 
1971     /* Skip the language */
1972     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1973     if(_isIDSeparator(*localeID)) {
1974         const char *scriptID;
1975         /* Skip the script if available */
1976         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1977         if(scriptID != localeID+1) {
1978             /* Found optional script */
1979             localeID = scriptID;
1980         }
1981         if(_isIDSeparator(*localeID)) {
1982             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1983         }
1984     }
1985     return u_terminateChars(country, countryCapacity, i, err);
1986 }
1987 
1988 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1989 uloc_getVariant(const char* localeID,
1990                 char* variant,
1991                 int32_t variantCapacity,
1992                 UErrorCode* err)
1993 {
1994     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1995     const char* tmpLocaleID;
1996     int32_t i=0;
1997 
1998     if(err==NULL || U_FAILURE(*err)) {
1999         return 0;
2000     }
2001 
2002     if (_hasBCP47Extension(localeID)) {
2003         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2004     } else {
2005         if (localeID==NULL) {
2006            localeID=uloc_getDefault();
2007         }
2008         tmpLocaleID=localeID;
2009     }
2010 
2011     /* Skip the language */
2012     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2013     if(_isIDSeparator(*tmpLocaleID)) {
2014         const char *scriptID;
2015         /* Skip the script if available */
2016         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2017         if(scriptID != tmpLocaleID+1) {
2018             /* Found optional script */
2019             tmpLocaleID = scriptID;
2020         }
2021         /* Skip the Country */
2022         if (_isIDSeparator(*tmpLocaleID)) {
2023             const char *cntryID;
2024             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2025             if (cntryID != tmpLocaleID+1) {
2026                 /* Found optional country */
2027                 tmpLocaleID = cntryID;
2028             }
2029             if(_isIDSeparator(*tmpLocaleID)) {
2030                 /* If there was no country ID, skip a possible extra IDSeparator */
2031                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2032                     tmpLocaleID++;
2033                 }
2034                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2035             }
2036         }
2037     }
2038 
2039     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2040     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2041 /*
2042     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2043         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2044     }
2045 */
2046     return u_terminateChars(variant, variantCapacity, i, err);
2047 }
2048 
2049 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2050 uloc_getName(const char* localeID,
2051              char* name,
2052              int32_t nameCapacity,
2053              UErrorCode* err)
2054 {
2055     return _canonicalize(localeID, name, nameCapacity, 0, err);
2056 }
2057 
2058 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2059 uloc_getBaseName(const char* localeID,
2060                  char* name,
2061                  int32_t nameCapacity,
2062                  UErrorCode* err)
2063 {
2064     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2065 }
2066 
2067 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2068 uloc_canonicalize(const char* localeID,
2069                   char* name,
2070                   int32_t nameCapacity,
2071                   UErrorCode* err)
2072 {
2073     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2074 }
2075 
2076 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)2077 uloc_getISO3Language(const char* localeID)
2078 {
2079     int16_t offset;
2080     char lang[ULOC_LANG_CAPACITY];
2081     UErrorCode err = U_ZERO_ERROR;
2082 
2083     if (localeID == NULL)
2084     {
2085         localeID = uloc_getDefault();
2086     }
2087     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2088     if (U_FAILURE(err))
2089         return "";
2090     offset = _findIndex(LANGUAGES, lang);
2091     if (offset < 0)
2092         return "";
2093     return LANGUAGES_3[offset];
2094 }
2095 
2096 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2097 uloc_getISO3Country(const char* localeID)
2098 {
2099     int16_t offset;
2100     char cntry[ULOC_LANG_CAPACITY];
2101     UErrorCode err = U_ZERO_ERROR;
2102 
2103     if (localeID == NULL)
2104     {
2105         localeID = uloc_getDefault();
2106     }
2107     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2108     if (U_FAILURE(err))
2109         return "";
2110     offset = _findIndex(COUNTRIES, cntry);
2111     if (offset < 0)
2112         return "";
2113 
2114     return COUNTRIES_3[offset];
2115 }
2116 
2117 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2118 uloc_getLCID(const char* localeID)
2119 {
2120     UErrorCode status = U_ZERO_ERROR;
2121     char       langID[ULOC_FULLNAME_CAPACITY];
2122 
2123     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2124     if (U_FAILURE(status)) {
2125         return 0;
2126     }
2127 
2128     if (uprv_strchr(localeID, '@')) {
2129         // uprv_convertToLCID does not support keywords other than collation.
2130         // Remove all keywords except collation.
2131         int32_t len;
2132         char collVal[ULOC_KEYWORDS_CAPACITY];
2133         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2134 
2135         len = uloc_getKeywordValue(localeID, "collation", collVal,
2136             UPRV_LENGTHOF(collVal) - 1, &status);
2137 
2138         if (U_SUCCESS(status) && len > 0) {
2139             collVal[len] = 0;
2140 
2141             len = uloc_getBaseName(localeID, tmpLocaleID,
2142                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2143 
2144             if (U_SUCCESS(status) && len > 0) {
2145                 tmpLocaleID[len] = 0;
2146 
2147                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2148                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2149 
2150                 if (U_SUCCESS(status) && len > 0) {
2151                     tmpLocaleID[len] = 0;
2152                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2153                 }
2154             }
2155         }
2156 
2157         // fall through - all keywords are simply ignored
2158         status = U_ZERO_ERROR;
2159     }
2160 
2161     return uprv_convertToLCID(langID, localeID, &status);
2162 }
2163 
2164 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2165 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2166                 UErrorCode *status)
2167 {
2168     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2169 }
2170 
2171 /* ### Default locale **************************************************/
2172 
2173 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2174 uloc_getDefault()
2175 {
2176     return locale_get_default();
2177 }
2178 
2179 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2180 uloc_setDefault(const char*   newDefaultLocale,
2181              UErrorCode* err)
2182 {
2183     if (U_FAILURE(*err))
2184         return;
2185     /* the error code isn't currently used for anything by this function*/
2186 
2187     /* propagate change to C++ */
2188     locale_set_default(newDefaultLocale);
2189 }
2190 
2191 /**
2192  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2193  * to an array of pointers to arrays of char.  All of these pointers are owned
2194  * by ICU-- do not delete them, and do not write through them.  The array is
2195  * terminated with a null pointer.
2196  */
2197 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2198 uloc_getISOLanguages()
2199 {
2200     return LANGUAGES;
2201 }
2202 
2203 /**
2204  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2205  * pointer to an array of pointers to arrays of char.  All of these pointers are
2206  * owned by ICU-- do not delete them, and do not write through them.  The array is
2207  * terminated with a null pointer.
2208  */
2209 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2210 uloc_getISOCountries()
2211 {
2212     return COUNTRIES;
2213 }
2214 
2215 
2216 /* this function to be moved into cstring.c later */
2217 static char gDecimal = 0;
2218 
2219 static /* U_CAPI */
2220 double
2221 /* U_EXPORT2 */
_uloc_strtod(const char * start,char ** end)2222 _uloc_strtod(const char *start, char **end) {
2223     char *decimal;
2224     char *myEnd;
2225     char buf[30];
2226     double rv;
2227     if (!gDecimal) {
2228         char rep[5];
2229         /* For machines that decide to change the decimal on you,
2230         and try to be too smart with localization.
2231         This normally should be just a '.'. */
2232         sprintf(rep, "%+1.1f", 1.0);
2233         gDecimal = rep[2];
2234     }
2235 
2236     if(gDecimal == '.') {
2237         return uprv_strtod(start, end); /* fall through to OS */
2238     } else {
2239         uprv_strncpy(buf, start, 29);
2240         buf[29]=0;
2241         decimal = uprv_strchr(buf, '.');
2242         if(decimal) {
2243             *decimal = gDecimal;
2244         } else {
2245             return uprv_strtod(start, end); /* no decimal point */
2246         }
2247         rv = uprv_strtod(buf, &myEnd);
2248         if(end) {
2249             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2250         }
2251         return rv;
2252     }
2253 }
2254 
/*
 * One parsed entry of an HTTP Accept-Language list, as collected by
 * uloc_acceptLanguageFromHTTP() and ordered by
 * uloc_acceptLanguageCompare().
 */
typedef struct {
    float q;        /* quality weight; set to 1.0 when the entry has no ';' parameter */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char locale[ULOC_FULLNAME_CAPACITY+1]; /* the language tag text of the entry */
} _acceptLangItem;
2260 
2261 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void *,const void * a,const void * b)2262 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2263 {
2264     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2265     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2266 
2267     int32_t rc = 0;
2268     if(bb->q < aa->q) {
2269         rc = -1;  /* A > B */
2270     } else if(bb->q > aa->q) {
2271         rc = 1;   /* A < B */
2272     } else {
2273         rc = 0;   /* A = B */
2274     }
2275 
2276     if(rc==0) {
2277         rc = uprv_stricmp(aa->locale, bb->locale);
2278     }
2279 
2280 #if defined(ULOC_DEBUG)
2281     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2282     aa->locale, aa->q,
2283     bb->locale, bb->q,
2284     rc);*/
2285 #endif
2286 
2287     return rc;
2288 }
2289 
2290 /*
2291 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2292 */
2293 
2294 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2295 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2296                             const char *httpAcceptLanguage,
2297                             UEnumeration* availableLocales,
2298                             UErrorCode *status)
2299 {
2300   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2301     char tmp[ULOC_FULLNAME_CAPACITY +1];
2302     int32_t n = 0;
2303     const char *itemEnd;
2304     const char *paramEnd;
2305     const char *s;
2306     const char *t;
2307     int32_t res;
2308     int32_t i;
2309     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2310 
2311     if(U_FAILURE(*status)) {
2312         return -1;
2313     }
2314 
2315     for(s=httpAcceptLanguage;s&&*s;) {
2316         while(isspace(*s)) /* eat space at the beginning */
2317             s++;
2318         itemEnd=uprv_strchr(s,',');
2319         paramEnd=uprv_strchr(s,';');
2320         if(!itemEnd) {
2321             itemEnd = httpAcceptLanguage+l; /* end of string */
2322         }
2323         if(paramEnd && paramEnd<itemEnd) {
2324             /* semicolon (;) is closer than end (,) */
2325             t = paramEnd+1;
2326             if(*t=='q') {
2327                 t++;
2328             }
2329             while(isspace(*t)) {
2330                 t++;
2331             }
2332             if(*t=='=') {
2333                 t++;
2334             }
2335             while(isspace(*t)) {
2336                 t++;
2337             }
2338             items[n].q = (float)_uloc_strtod(t,NULL);
2339         } else {
2340             /* no semicolon - it's 1.0 */
2341             items[n].q = 1.0f;
2342             paramEnd = itemEnd;
2343         }
2344         items[n].dummy=0;
2345         /* eat spaces prior to semi */
2346         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2347             ;
2348         int32_t slen = ((t+1)-s);
2349         if(slen > ULOC_FULLNAME_CAPACITY) {
2350           *status = U_BUFFER_OVERFLOW_ERROR;
2351           return -1; // too big
2352         }
2353         uprv_strncpy(items[n].locale, s, slen);
2354         items[n].locale[slen]=0; // terminate
2355         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2356         if(U_FAILURE(*status)) return -1;
2357         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2358             // canonicalization had an effect- copy back
2359             uprv_strncpy(items[n].locale, tmp, clen);
2360             items[n].locale[clen] = 0; // terminate
2361         }
2362 #if defined(ULOC_DEBUG)
2363         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2364 #endif
2365         n++;
2366         s = itemEnd;
2367         while(*s==',') { /* eat duplicate commas */
2368             s++;
2369         }
2370         if(n>=items.getCapacity()) { // If we need more items
2371           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2372               *status = U_MEMORY_ALLOCATION_ERROR;
2373               return -1;
2374           }
2375 #if defined(ULOC_DEBUG)
2376           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2377 #endif
2378         }
2379     }
2380     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2381     if (U_FAILURE(*status)) {
2382         return -1;
2383     }
2384     LocalMemory<const char*> strs(NULL);
2385     if (strs.allocateInsteadAndReset(n) == NULL) {
2386         *status = U_MEMORY_ALLOCATION_ERROR;
2387         return -1;
2388     }
2389     for(i=0;i<n;i++) {
2390 #if defined(ULOC_DEBUG)
2391         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2392 #endif
2393         strs[i]=items[i].locale;
2394     }
2395     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2396                                strs.getAlias(), n, availableLocales, status);
2397     return res;
2398 }
2399 
2400 
2401 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2402 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2403                     UAcceptResult *outResult, const char **acceptList,
2404                     int32_t acceptListCount,
2405                     UEnumeration* availableLocales,
2406                     UErrorCode *status)
2407 {
2408     int32_t i,j;
2409     int32_t len;
2410     int32_t maxLen=0;
2411     char tmp[ULOC_FULLNAME_CAPACITY+1];
2412     const char *l;
2413     char **fallbackList;
2414     if(U_FAILURE(*status)) {
2415         return -1;
2416     }
2417     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2418     if(fallbackList==NULL) {
2419         *status = U_MEMORY_ALLOCATION_ERROR;
2420         return -1;
2421     }
2422     for(i=0;i<acceptListCount;i++) {
2423 #if defined(ULOC_DEBUG)
2424         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2425 #endif
2426         while((l=uenum_next(availableLocales, NULL, status))) {
2427 #if defined(ULOC_DEBUG)
2428             fprintf(stderr,"  %s\n", l);
2429 #endif
2430             len = (int32_t)uprv_strlen(l);
2431             if(!uprv_strcmp(acceptList[i], l)) {
2432                 if(outResult) {
2433                     *outResult = ULOC_ACCEPT_VALID;
2434                 }
2435 #if defined(ULOC_DEBUG)
2436                 fprintf(stderr, "MATCH! %s\n", l);
2437 #endif
2438                 if(len>0) {
2439                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2440                 }
2441                 for(j=0;j<i;j++) {
2442                     uprv_free(fallbackList[j]);
2443                 }
2444                 uprv_free(fallbackList);
2445                 return u_terminateChars(result, resultAvailable, len, status);
2446             }
2447             if(len>maxLen) {
2448                 maxLen = len;
2449             }
2450         }
2451         uenum_reset(availableLocales, status);
2452         /* save off parent info */
2453         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2454             fallbackList[i] = uprv_strdup(tmp);
2455         } else {
2456             fallbackList[i]=0;
2457         }
2458     }
2459 
2460     for(maxLen--;maxLen>0;maxLen--) {
2461         for(i=0;i<acceptListCount;i++) {
2462             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2463 #if defined(ULOC_DEBUG)
2464                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2465 #endif
2466                 while((l=uenum_next(availableLocales, NULL, status))) {
2467 #if defined(ULOC_DEBUG)
2468                     fprintf(stderr,"  %s\n", l);
2469 #endif
2470                     len = (int32_t)uprv_strlen(l);
2471                     if(!uprv_strcmp(fallbackList[i], l)) {
2472                         if(outResult) {
2473                             *outResult = ULOC_ACCEPT_FALLBACK;
2474                         }
2475 #if defined(ULOC_DEBUG)
2476                         fprintf(stderr, "fallback MATCH! %s\n", l);
2477 #endif
2478                         if(len>0) {
2479                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2480                         }
2481                         for(j=0;j<acceptListCount;j++) {
2482                             uprv_free(fallbackList[j]);
2483                         }
2484                         uprv_free(fallbackList);
2485                         return u_terminateChars(result, resultAvailable, len, status);
2486                     }
2487                 }
2488                 uenum_reset(availableLocales, status);
2489 
2490                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2491                     uprv_free(fallbackList[i]);
2492                     fallbackList[i] = uprv_strdup(tmp);
2493                 } else {
2494                     uprv_free(fallbackList[i]);
2495                     fallbackList[i]=0;
2496                 }
2497             }
2498         }
2499         if(outResult) {
2500             *outResult = ULOC_ACCEPT_FAILED;
2501         }
2502     }
2503     for(i=0;i<acceptListCount;i++) {
2504         uprv_free(fallbackList[i]);
2505     }
2506     uprv_free(fallbackList);
2507     return -1;
2508 }
2509 
2510 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2511 uloc_toUnicodeLocaleKey(const char* keyword)
2512 {
2513     const char* bcpKey = ulocimp_toBcpKey(keyword);
2514     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2515         // unknown keyword, but syntax is fine..
2516         return keyword;
2517     }
2518     return bcpKey;
2519 }
2520 
2521 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2522 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2523 {
2524     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2525     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2526         // unknown keyword, but syntax is fine..
2527         return value;
2528     }
2529     return bcpType;
2530 }
2531 
/* Nonzero when c is one of the ASCII digits '0'..'9'. The argument is evaluated twice. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
/* Nonzero when c is accepted by uprv_isASCIILetter() or is an ASCII digit.
   The argument may be evaluated more than once. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))
2534 
2535 static UBool
isWellFormedLegacyKey(const char * legacyKey)2536 isWellFormedLegacyKey(const char* legacyKey)
2537 {
2538     const char* p = legacyKey;
2539     while (*p) {
2540         if (!UPRV_ISALPHANUM(*p)) {
2541             return FALSE;
2542         }
2543         p++;
2544     }
2545     return TRUE;
2546 }
2547 
2548 static UBool
isWellFormedLegacyType(const char * legacyType)2549 isWellFormedLegacyType(const char* legacyType)
2550 {
2551     const char* p = legacyType;
2552     int32_t alphaNumLen = 0;
2553     while (*p) {
2554         if (*p == '_' || *p == '/' || *p == '-') {
2555             if (alphaNumLen == 0) {
2556                 return FALSE;
2557             }
2558             alphaNumLen = 0;
2559         } else if (UPRV_ISALPHANUM(*p)) {
2560             alphaNumLen++;
2561         } else {
2562             return FALSE;
2563         }
2564         p++;
2565     }
2566     return (alphaNumLen != 0);
2567 }
2568 
2569 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2570 uloc_toLegacyKey(const char* keyword)
2571 {
2572     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2573     if (legacyKey == NULL) {
2574         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2575         //
2576         // Note:
2577         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2578         //  However, a key should not contain '=' obviously. For now, all existing
2579         //  keys are using ASCII alphabetic letters only. We won't add any new key
2580         //  that is not compatible with the BCP 47 syntax. Therefore, we assume
2581         //  a valid key consist from [0-9a-zA-Z], no symbols.
2582         if (isWellFormedLegacyKey(keyword)) {
2583             return keyword;
2584         }
2585     }
2586     return legacyKey;
2587 }
2588 
2589 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2590 uloc_toLegacyType(const char* keyword, const char* value)
2591 {
2592     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2593     if (legacyType == NULL) {
2594         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2595         //
2596         // Note:
2597         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2598         //  However, a type should not contain '=' obviously. For now, all existing
2599         //  types are using ASCII alphabetic letters with a few symbol letters. We won't
2600         //  add any new type that is not compatible with the BCP 47 syntax except timezone
2601         //  IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
2602         //  '-' '_' '/' in the middle.
2603         if (isWellFormedLegacyType(value)) {
2604             return value;
2605         }
2606     }
2607     return legacyType;
2608 }
2609 
2610 /*eof*/
2611