1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1996-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * Provides functionality for mapping between
10 * LCID and Posix IDs or ICU locale to codepage
11 *
12 * Note: All classes and code in this file are
13 * intended for internal use only.
14 *
15 * Methods of interest:
16 * unsigned long convertToLCID(const char*);
17 * const char* convertToPosix(unsigned long);
18 *
19 * Kathleen Wilson, 4/30/96
20 *
21 * Date Name Description
22 * 3/11/97 aliu Fixed off-by-one bug in assignment operator. Added
23 * setId() method and safety check against
24 * MAX_ID_LENGTH.
25 * 04/23/99 stephen Added C wrapper for convertToPosix.
26 * 09/18/00 george Removed the memory leaks.
27 * 08/23/01 george Convert to C
28 */
29
30 #include "locmap.h"
31 #include "charstr.h"
32 #include "cstring.h"
33 #include "cmemory.h"
34 #include "ulocimp.h"
35 #include "unicode/uloc.h"
36
37 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
38 #include <windows.h>
39 #include <winnls.h> // LCIDToLocaleName and LocaleNameToLCID
40 #endif
41
42 /*
43 * Note:
44 * The mapping from Win32 locale ID numbers to POSIX locale strings should
45 * be the faster one.
46 *
47 * Windows LCIDs are defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
48 * [MS-LCID] Windows Language Code Identifier (LCID) Reference
49 */
50
51 namespace {
52
53 /*
54 ////////////////////////////////////////////////
55 //
56 // Internal Classes for LCID <--> POSIX Mapping
57 //
58 /////////////////////////////////////////////////
59 */
60
/**
 * One row of a locale table: a Windows LCID paired with the POSIX-style
 * locale ID it corresponds to. Both members are immutable after
 * initialization, so rows can live in constexpr arrays.
 */
struct ILcidPosixElement
{
    const uint32_t hostID;        /* LCID in host (Windows) format */
    const char* const posixID;    /* NUL-terminated POSIX locale ID */
};
66
/**
 * Describes one table of ILcidPosixElement rows: how many rows there are
 * and where the first one lives.
 */
struct ILcidPosixMap
{
    const uint32_t numRegions;                          /* number of rows in regionMaps */
    const struct ILcidPosixElement* const regionMaps;   /* first row of the table */
};
72
73
74 /*
75 /////////////////////////////////////////////////
76 //
77 // Easy macros to make the LCID <--> POSIX Mapping
78 //
79 /////////////////////////////////////////////////
80 */
81
/**
 * Defines the table for a language that has exactly one LCID.
 * Expands to a constexpr array named locmap_<languageID> with two rows:
 * the language-only (parent) entry first, then the full locale entry.
 * @param lcid       LCID in host format such as 0x044d
 * @param languageID posix ID of just the language such as de; passed unquoted
 *                   because it is both stringified and pasted into the array name
 * @param localeID   posix ID of the language_TERRITORY such as de_CH; passed
 *                   unquoted because it is stringified
 */
#define ILCID_POSIX_ELEMENT_ARRAY(lcid, languageID, localeID) \
constexpr ILcidPosixElement locmap_ ## languageID [] = { \
    {LANGUAGE_LCID(lcid), #languageID}, /* parent locale */ \
    {lcid, #localeID}, \
};
95
/**
 * Opens the definition of a hand-written table.
 * Expands to "constexpr ILcidPosixElement locmap_<name>[] =" and must be
 * followed by a brace-enclosed list of {LCID, "posixID"} rows and a
 * terminating semicolon.
 * @param name the POSIX ID, either a language or language_TERRITORY; it is
 *             pasted into the array name, so it must form a valid identifier
 */
#define ILCID_POSIX_SUBTABLE(name) \
constexpr ILcidPosixElement locmap_ ## name [] =
102
103
/**
 * Produces the ILcidPosixMap initializer for one table: the row count of
 * locmap_<tableID> followed by the array itself. The argument must be the
 * same identifier the table was defined with, and the table's first row is
 * expected to be the language-only entry.
 * @param tableID the POSIX ID (identifier suffix) naming the table.
 */
#define ILCID_POSIX_MAP(tableID) \
    {UPRV_LENGTHOF(locmap_ ## tableID), locmap_ ## tableID}
112
113 /*
114 ////////////////////////////////////////////
115 //
116 // Create the table of LCID to POSIX Mapping
117 // None of it should be dynamically created.
118 //
119 // Keep static locale variables inside the function so that
120 // it can be created properly during static init.
121 //
122 // Note: This table should be updated periodically. Check the [MS-LCID] Windows Language Code Identifier
123 // (LCID) Reference defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
124 //
125 // Microsoft is moving away from LCID in favor of locale name as of Vista. This table needs to be
126 // maintained for support of older Windows version.
127 // Update: Windows 7 (091130)
128 //
// Note: Microsoft assigns a different LCID if a locale has a sorting variant. POSIX IDs below may contain
130 // @collation=XXX, but no other keywords are allowed (at least for now). When uprv_convertToLCID() is
131 // called from uloc_getLCID(), keywords other than collation are already removed. If we really need
132 // to support other keywords in this mapping data, we must update the implementation.
133 ////////////////////////////////////////////
134 */
135
136 // TODO: For Windows ideally this table would be a list of exceptions rather than a complete list as
137 // LocaleNameToLCID and LCIDToLocaleName provide 90% of these.
138
139 ILCID_POSIX_ELEMENT_ARRAY(0x0436, af, af_ZA)
140
/*
 * Arabic (language ID 0x01). The first row is the language-only entry; the
 * remaining rows are the regional variants.
 * NOTE(review): "ar_MA" and "ar_MO" both carry LCID 0x1801 - confirm whether
 * "ar_MO" is an intentional alias.
 */
ILCID_POSIX_SUBTABLE(ar) {
    {0x01,   "ar"},
    {0x3801, "ar_AE"},
    {0x3c01, "ar_BH"},
    {0x1401, "ar_DZ"},
    {0x0c01, "ar_EG"},
    {0x0801, "ar_IQ"},
    {0x2c01, "ar_JO"},
    {0x3401, "ar_KW"},
    {0x3001, "ar_LB"},
    {0x1001, "ar_LY"},
    {0x1801, "ar_MA"},
    {0x1801, "ar_MO"}, /* same LCID as ar_MA */
    {0x2001, "ar_OM"},
    {0x4001, "ar_QA"},
    {0x0401, "ar_SA"},
    {0x2801, "ar_SY"},
    {0x1c01, "ar_TN"},
    {0x2401, "ar_YE"}
};
161
162 ILCID_POSIX_ELEMENT_ARRAY(0x044d, as, as_IN)
163 ILCID_POSIX_ELEMENT_ARRAY(0x045e, am, am_ET)
164 ILCID_POSIX_ELEMENT_ARRAY(0x047a, arn,arn_CL)
165
ILCID_POSIX_SUBTABLE(az)166 ILCID_POSIX_SUBTABLE(az) {
167 {0x2c, "az"},
168 {0x082c, "az_Cyrl_AZ"}, /* Cyrillic based */
169 {0x742c, "az_Cyrl"}, /* Cyrillic based */
170 {0x042c, "az_Latn_AZ"}, /* Latin based */
171 {0x782c, "az_Latn"}, /* Latin based */
172 {0x042c, "az_AZ"} /* Latin based */
173 };
174
175 ILCID_POSIX_ELEMENT_ARRAY(0x046d, ba, ba_RU)
176 ILCID_POSIX_ELEMENT_ARRAY(0x0423, be, be_BY)
177
178 /*ILCID_POSIX_SUBTABLE(ber) {
179 {0x5f, "ber"},
180 {0x045f, "ber_Arab_DZ"},
181 {0x045f, "ber_Arab"},
182 {0x085f, "ber_Latn_DZ"},
183 {0x085f, "ber_Latn"}
184 };*/
185
186 ILCID_POSIX_ELEMENT_ARRAY(0x0402, bg, bg_BG)
187
ILCID_POSIX_SUBTABLE(bin)188 ILCID_POSIX_SUBTABLE(bin) {
189 {0x66, "bin"},
190 {0x0466, "bin_NG"}
191 };
192
ILCID_POSIX_SUBTABLE(bn)193 ILCID_POSIX_SUBTABLE(bn) {
194 {0x45, "bn"},
195 {0x0845, "bn_BD"},
196 {0x0445, "bn_IN"}
197 };
198
/*
 * Tibetan (language ID 0x51). The first row is the language-only entry.
 * The last row files "dz_BT" in this table; gPosixIDmap has no separate dz
 * table, so this is the only row that mentions dz.
 */
ILCID_POSIX_SUBTABLE(bo) {
    {0x51,   "bo"},
    {0x0851, "bo_BT"},
    {0x0451, "bo_CN"},
    {0x0c51, "dz_BT"} /* different language code, same 0x51 low byte */
};
205
206 ILCID_POSIX_ELEMENT_ARRAY(0x047e, br, br_FR)
207
ILCID_POSIX_SUBTABLE(ca)208 ILCID_POSIX_SUBTABLE(ca) {
209 {0x03, "ca"},
210 {0x0403, "ca_ES"},
211 {0x0803, "ca_ES_VALENCIA"}
212 };
213
214 ILCID_POSIX_ELEMENT_ARRAY(0x0483, co, co_FR)
215
ILCID_POSIX_SUBTABLE(chr)216 ILCID_POSIX_SUBTABLE(chr) {
217 {0x05c, "chr"},
218 {0x7c5c, "chr_Cher"},
219 {0x045c, "chr_Cher_US"},
220 {0x045c, "chr_US"}
221 };
222
223 // ICU has chosen different names for these.
ILCID_POSIX_SUBTABLE(ckb)224 ILCID_POSIX_SUBTABLE(ckb) {
225 {0x92, "ckb"},
226 {0x7c92, "ckb_Arab"},
227 {0x0492, "ckb_Arab_IQ"}
228 };
229
230 /* Declared as cs_CZ to get around compiler errors on z/OS, which defines cs as a function */
231 ILCID_POSIX_ELEMENT_ARRAY(0x0405, cs, cs_CZ)
232
233 ILCID_POSIX_ELEMENT_ARRAY(0x0452, cy, cy_GB)
234 ILCID_POSIX_ELEMENT_ARRAY(0x0406, da, da_DK)
235
236 // Windows doesn't know POSIX or BCP47 Unicode phonebook sort names
ILCID_POSIX_SUBTABLE(de)237 ILCID_POSIX_SUBTABLE(de) {
238 {0x07, "de"},
239 {0x0c07, "de_AT"},
240 {0x0807, "de_CH"},
241 {0x0407, "de_DE"},
242 {0x1407, "de_LI"},
243 {0x1007, "de_LU"},
244 {0x10407,"de_DE@collation=phonebook"}, /*This is really de_DE_PHONEBOOK on Windows*/
245 {0x10407,"de@collation=phonebook"} /*This is really de_DE_PHONEBOOK on Windows*/
246 };
247
248 ILCID_POSIX_ELEMENT_ARRAY(0x0465, dv, dv_MV)
249 ILCID_POSIX_ELEMENT_ARRAY(0x0408, el, el_GR)
250
251 // Windows uses an empty string for 'invariant'
ILCID_POSIX_SUBTABLE(en)252 ILCID_POSIX_SUBTABLE(en) {
253 {0x09, "en"},
254 {0x0c09, "en_AU"},
255 {0x2809, "en_BZ"},
256 {0x1009, "en_CA"},
257 {0x0809, "en_GB"},
258 {0x3c09, "en_HK"},
259 {0x3809, "en_ID"},
260 {0x1809, "en_IE"},
261 {0x4009, "en_IN"},
262 {0x2009, "en_JM"},
263 {0x4409, "en_MY"},
264 {0x1409, "en_NZ"},
265 {0x3409, "en_PH"},
266 {0x4809, "en_SG"},
267 {0x2C09, "en_TT"},
268 {0x0409, "en_US"},
269 {0x007f, "en_US_POSIX"}, /* duplicate for round-tripping */
270 {0x2409, "en_029"},
271 {0x1c09, "en_ZA"},
272 {0x3009, "en_ZW"},
273 {0x2409, "en_VI"}, /* Virgin Islands AKA Caribbean Islands (en_CB). On Windows8+ This is 0x1000 or dynamically assigned */
274 {0x0409, "en_AS"}, /* Alias for en_US. Leave last. On Windows8+ This is 0x1000 or dynamically assigned */
275 {0x0409, "en_GU"}, /* Alias for en_US. Leave last. On Windows8+ This is 0x1000 or dynamically assigned */
276 {0x0409, "en_MH"}, /* Alias for en_US. Leave last. On Windows8+ This is 0x1000 or dynamically assigned */
277 {0x0409, "en_MP"}, /* Alias for en_US. Leave last. On Windows8+ This is 0x1000 or dynamically assigned */
278 {0x0409, "en_UM"} /* Alias for en_US. Leave last. On Windows8+ This is 0x1000 or dynamically assigned */
279 };
280
ILCID_POSIX_SUBTABLE(en_US_POSIX)281 ILCID_POSIX_SUBTABLE(en_US_POSIX) {
282 {0x007f, "en_US_POSIX"} /* duplicate for roundtripping */
283 };
284
285 // Windows doesn't know POSIX or BCP47 Unicode traditional sort names
ILCID_POSIX_SUBTABLE(es)286 ILCID_POSIX_SUBTABLE(es) {
287 {0x0a, "es"},
288 {0x2c0a, "es_AR"},
289 {0x400a, "es_BO"},
290 {0x340a, "es_CL"},
291 {0x240a, "es_CO"},
292 {0x140a, "es_CR"},
293 {0x5c0a, "es_CU"},
294 {0x1c0a, "es_DO"},
295 {0x300a, "es_EC"},
296 {0x0c0a, "es_ES"}, /*Modern sort.*/
297 {0x100a, "es_GT"},
298 {0x480a, "es_HN"},
299 {0x080a, "es_MX"},
300 {0x4c0a, "es_NI"},
301 {0x180a, "es_PA"},
302 {0x280a, "es_PE"},
303 {0x500a, "es_PR"},
304 {0x3c0a, "es_PY"},
305 {0x440a, "es_SV"},
306 {0x540a, "es_US"},
307 {0x380a, "es_UY"},
308 {0x200a, "es_VE"},
309 {0x580a, "es_419"},
310 {0x040a, "es_ES@collation=traditional"},
311 {0x040a, "es@collation=traditional"} // Windows will treat this as es-ES@collation=traditional
312 };
313
314 ILCID_POSIX_ELEMENT_ARRAY(0x0425, et, et_EE)
315 ILCID_POSIX_ELEMENT_ARRAY(0x042d, eu, eu_ES)
316
317 /* ISO-639 doesn't distinguish between Persian and Dari.*/
ILCID_POSIX_SUBTABLE(fa)318 ILCID_POSIX_SUBTABLE(fa) {
319 {0x29, "fa"},
320 {0x0429, "fa_IR"}, /* Persian/Farsi (Iran) */
321 {0x048c, "fa_AF"} /* Persian/Dari (Afghanistan) */
322 };
323
324
325 /* duplicate for roundtripping */
ILCID_POSIX_SUBTABLE(fa_AF)326 ILCID_POSIX_SUBTABLE(fa_AF) {
327 {0x8c, "fa_AF"}, /* Persian/Dari (Afghanistan) */
328 {0x048c, "fa_AF"} /* Persian/Dari (Afghanistan) */
329 };
330
ILCID_POSIX_SUBTABLE(ff)331 ILCID_POSIX_SUBTABLE(ff) {
332 {0x67, "ff"},
333 {0x7c67, "ff_Latn"},
334 {0x0867, "ff_Latn_SN"},
335 {0x0467, "ff_NG"}
336 };
337
338 ILCID_POSIX_ELEMENT_ARRAY(0x040b, fi, fi_FI)
339 ILCID_POSIX_ELEMENT_ARRAY(0x0464, fil,fil_PH)
340 ILCID_POSIX_ELEMENT_ARRAY(0x0438, fo, fo_FO)
341
ILCID_POSIX_SUBTABLE(fr)342 ILCID_POSIX_SUBTABLE(fr) {
343 {0x0c, "fr"},
344 {0x080c, "fr_BE"},
345 {0x0c0c, "fr_CA"},
346 {0x240c, "fr_CD"},
347 {0x240c, "fr_CG"},
348 {0x100c, "fr_CH"},
349 {0x300c, "fr_CI"},
350 {0x2c0c, "fr_CM"},
351 {0x040c, "fr_FR"},
352 {0x3c0c, "fr_HT"},
353 {0x140c, "fr_LU"},
354 {0x380c, "fr_MA"},
355 {0x180c, "fr_MC"},
356 {0x340c, "fr_ML"},
357 {0x200c, "fr_RE"},
358 {0x280c, "fr_SN"},
359 {0xe40c, "fr_015"},
360 {0x1c0c, "fr_029"}
361 };
362
363 ILCID_POSIX_ELEMENT_ARRAY(0x0467, fuv, fuv_NG)
364
365 ILCID_POSIX_ELEMENT_ARRAY(0x0462, fy, fy_NL)
366
/*
 * Language ID 0x3c. The first row is the language-only entry.
 * The last row maps LCID 0x043c to "gd_GB"; the same POSIX ID also appears in
 * the gd table with LCID 0x0491.
 */
ILCID_POSIX_SUBTABLE(ga) { /* Gaelic (Ireland) */
    {0x3c,   "ga"},
    {0x083c, "ga_IE"},
    {0x043c, "gd_GB"} /* gd row kept in the ga table; see the gd table for 0x0491 */
};
372
ILCID_POSIX_SUBTABLE(gd)373 ILCID_POSIX_SUBTABLE(gd) { /* Gaelic (Scotland) */
374 {0x91, "gd"},
375 {0x0491, "gd_GB"}
376 };
377
378 ILCID_POSIX_ELEMENT_ARRAY(0x0456, gl, gl_ES)
379 ILCID_POSIX_ELEMENT_ARRAY(0x0447, gu, gu_IN)
380 ILCID_POSIX_ELEMENT_ARRAY(0x0474, gn, gn_PY)
381 ILCID_POSIX_ELEMENT_ARRAY(0x0484, gsw,gsw_FR)
382
ILCID_POSIX_SUBTABLE(ha)383 ILCID_POSIX_SUBTABLE(ha) {
384 {0x68, "ha"},
385 {0x7c68, "ha_Latn"},
386 {0x0468, "ha_Latn_NG"},
387 };
388
389 ILCID_POSIX_ELEMENT_ARRAY(0x0475, haw,haw_US)
390 ILCID_POSIX_ELEMENT_ARRAY(0x040d, he, he_IL)
391 ILCID_POSIX_ELEMENT_ARRAY(0x0439, hi, hi_IN)
392
393 /* This LCID is really four different locales.*/
ILCID_POSIX_SUBTABLE(hr)394 ILCID_POSIX_SUBTABLE(hr) {
395 {0x1a, "hr"},
396 {0x141a, "bs_Latn_BA"}, /* Bosnian, Bosnia and Herzegovina */
397 {0x681a, "bs_Latn"}, /* Bosnian, Bosnia and Herzegovina */
398 {0x141a, "bs_BA"}, /* Bosnian, Bosnia and Herzegovina */
399 {0x781a, "bs"}, /* Bosnian */
400 {0x201a, "bs_Cyrl_BA"}, /* Bosnian, Bosnia and Herzegovina */
401 {0x641a, "bs_Cyrl"}, /* Bosnian, Bosnia and Herzegovina */
402 {0x101a, "hr_BA"}, /* Croatian in Bosnia */
403 {0x041a, "hr_HR"}, /* Croatian*/
404 {0x2c1a, "sr_Latn_ME"},
405 {0x241a, "sr_Latn_RS"},
406 {0x181a, "sr_Latn_BA"}, /* Serbo-Croatian in Bosnia */
407 {0x081a, "sr_Latn_CS"}, /* Serbo-Croatian*/
408 {0x701a, "sr_Latn"}, /* It's 0x1a or 0x081a, pick one to make the test program happy. */
409 {0x1c1a, "sr_Cyrl_BA"}, /* Serbo-Croatian in Bosnia */
410 {0x0c1a, "sr_Cyrl_CS"}, /* Serbian*/
411 {0x301a, "sr_Cyrl_ME"},
412 {0x281a, "sr_Cyrl_RS"},
413 {0x6c1a, "sr_Cyrl"}, /* It's 0x1a or 0x0c1a, pick one to make the test program happy. */
414 {0x7c1a, "sr"} /* In CLDR sr is sr_Cyrl. */
415 };
416
/*
 * Language ID 0x2E. The first row is the language-only entry for hsb.
 * The dsb rows are filed here as well: their LCIDs share the 0x2E low byte
 * and gPosixIDmap has no separate dsb table.
 */
ILCID_POSIX_SUBTABLE(hsb) {
    {0x2E,   "hsb"},
    {0x042E, "hsb_DE"},
    {0x082E, "dsb_DE"},
    {0x7C2E, "dsb"},
};
423
424 ILCID_POSIX_ELEMENT_ARRAY(0x040e, hu, hu_HU)
425 ILCID_POSIX_ELEMENT_ARRAY(0x042b, hy, hy_AM)
426
ILCID_POSIX_SUBTABLE(ibb)427 ILCID_POSIX_SUBTABLE(ibb) {
428 {0x69, "ibb"},
429 {0x0469, "ibb_NG"}
430 };
431
432 ILCID_POSIX_ELEMENT_ARRAY(0x0421, id, id_ID)
433 ILCID_POSIX_ELEMENT_ARRAY(0x0470, ig, ig_NG)
434 ILCID_POSIX_ELEMENT_ARRAY(0x0478, ii, ii_CN)
435 ILCID_POSIX_ELEMENT_ARRAY(0x040f, is, is_IS)
436
ILCID_POSIX_SUBTABLE(it)437 ILCID_POSIX_SUBTABLE(it) {
438 {0x10, "it"},
439 {0x0810, "it_CH"},
440 {0x0410, "it_IT"}
441 };
442
ILCID_POSIX_SUBTABLE(iu)443 ILCID_POSIX_SUBTABLE(iu) {
444 {0x5d, "iu"},
445 {0x045d, "iu_Cans_CA"},
446 {0x785d, "iu_Cans"},
447 {0x085d, "iu_Latn_CA"},
448 {0x7c5d, "iu_Latn"}
449 };
450
451 ILCID_POSIX_ELEMENT_ARRAY(0x040d, iw, iw_IL) /*Left in for compatibility*/
452 ILCID_POSIX_ELEMENT_ARRAY(0x0411, ja, ja_JP)
453 ILCID_POSIX_ELEMENT_ARRAY(0x0437, ka, ka_GE)
454 ILCID_POSIX_ELEMENT_ARRAY(0x043f, kk, kk_KZ)
455 ILCID_POSIX_ELEMENT_ARRAY(0x046f, kl, kl_GL)
456 ILCID_POSIX_ELEMENT_ARRAY(0x0453, km, km_KH)
457 ILCID_POSIX_ELEMENT_ARRAY(0x044b, kn, kn_IN)
458
ILCID_POSIX_SUBTABLE(ko)459 ILCID_POSIX_SUBTABLE(ko) {
460 {0x12, "ko"},
461 {0x0812, "ko_KP"},
462 {0x0412, "ko_KR"}
463 };
464
465 ILCID_POSIX_ELEMENT_ARRAY(0x0457, kok, kok_IN)
466 ILCID_POSIX_ELEMENT_ARRAY(0x0471, kr, kr_NG)
467
ILCID_POSIX_SUBTABLE(ks)468 ILCID_POSIX_SUBTABLE(ks) { /* We could add PK and CN too */
469 {0x60, "ks"},
470 {0x0460, "ks_Arab_IN"},
471 {0x0860, "ks_Deva_IN"}
472 };
473
474 ILCID_POSIX_ELEMENT_ARRAY(0x0440, ky, ky_KG) /* Kyrgyz is spoken in Kyrgyzstan */
475
ILCID_POSIX_SUBTABLE(la)476 ILCID_POSIX_SUBTABLE(la) {
477 {0x76, "la"},
478 {0x0476, "la_001"},
479 {0x0476, "la_IT"} /*Left in for compatibility*/
480 };
481
482 ILCID_POSIX_ELEMENT_ARRAY(0x046e, lb, lb_LU)
483 ILCID_POSIX_ELEMENT_ARRAY(0x0454, lo, lo_LA)
484 ILCID_POSIX_ELEMENT_ARRAY(0x0427, lt, lt_LT)
485 ILCID_POSIX_ELEMENT_ARRAY(0x0426, lv, lv_LV)
486 ILCID_POSIX_ELEMENT_ARRAY(0x0481, mi, mi_NZ)
487 ILCID_POSIX_ELEMENT_ARRAY(0x042f, mk, mk_MK)
488 ILCID_POSIX_ELEMENT_ARRAY(0x044c, ml, ml_IN)
489
ILCID_POSIX_SUBTABLE(mn)490 ILCID_POSIX_SUBTABLE(mn) {
491 {0x50, "mn"},
492 {0x0450, "mn_MN"},
493 {0x7c50, "mn_Mong"},
494 {0x0850, "mn_Mong_CN"},
495 {0x0850, "mn_CN"},
496 {0x7850, "mn_Cyrl"},
497 {0x0c50, "mn_Mong_MN"}
498 };
499
500 ILCID_POSIX_ELEMENT_ARRAY(0x0458, mni,mni_IN)
501 ILCID_POSIX_ELEMENT_ARRAY(0x047c, moh,moh_CA)
502 ILCID_POSIX_ELEMENT_ARRAY(0x044e, mr, mr_IN)
503
ILCID_POSIX_SUBTABLE(ms)504 ILCID_POSIX_SUBTABLE(ms) {
505 {0x3e, "ms"},
506 {0x083e, "ms_BN"}, /* Brunei Darussalam*/
507 {0x043e, "ms_MY"} /* Malaysia*/
508 };
509
510 ILCID_POSIX_ELEMENT_ARRAY(0x043a, mt, mt_MT)
511 ILCID_POSIX_ELEMENT_ARRAY(0x0455, my, my_MM)
512
ILCID_POSIX_SUBTABLE(ne)513 ILCID_POSIX_SUBTABLE(ne) {
514 {0x61, "ne"},
515 {0x0861, "ne_IN"}, /* India*/
516 {0x0461, "ne_NP"} /* Nepal*/
517 };
518
ILCID_POSIX_SUBTABLE(nl)519 ILCID_POSIX_SUBTABLE(nl) {
520 {0x13, "nl"},
521 {0x0813, "nl_BE"},
522 {0x0413, "nl_NL"}
523 };
524
525 /* The "no" locale split into nb and nn. By default in ICU, "no" is nb.*/
526 // TODO: Not all of these are needed on Windows, but I don't know how ICU treats preferred ones here.
ILCID_POSIX_SUBTABLE(no)527 ILCID_POSIX_SUBTABLE(no) {
528 {0x14, "no"}, /* really nb_NO - actually Windows differentiates between neutral (no region) and specific (with region) */
529 {0x7c14, "nb"}, /* really nb */
530 {0x0414, "nb_NO"}, /* really nb_NO. Keep first in the 414 list. */
531 {0x0414, "no_NO"}, /* really nb_NO */
532 {0x0814, "nn_NO"}, /* really nn_NO. Keep first in the 814 list. */
533 {0x7814, "nn"}, /* It's 0x14 or 0x814, pick one to make the test program happy. */
534 {0x0814, "no_NO_NY"}/* really nn_NO */
535 };
536
537 ILCID_POSIX_ELEMENT_ARRAY(0x046c, nso,nso_ZA) /* TODO: Verify the ISO-639 code */
538 ILCID_POSIX_ELEMENT_ARRAY(0x0482, oc, oc_FR)
539
ILCID_POSIX_SUBTABLE(om)540 ILCID_POSIX_SUBTABLE(om) { /* TODO: Verify the country */
541 {0x72, "om"},
542 {0x0472, "om_ET"},
543 {0x0472, "gaz_ET"}
544 };
545
546 /* Declared as or_IN to get around compiler errors*/
ILCID_POSIX_SUBTABLE(or_IN)547 ILCID_POSIX_SUBTABLE(or_IN) {
548 {0x48, "or"},
549 {0x0448, "or_IN"},
550 };
551
ILCID_POSIX_SUBTABLE(pa)552 ILCID_POSIX_SUBTABLE(pa) {
553 {0x46, "pa"},
554 {0x0446, "pa_IN"},
555 {0x0846, "pa_Arab_PK"},
556 {0x0846, "pa_PK"}
557 };
558
ILCID_POSIX_SUBTABLE(pap)559 ILCID_POSIX_SUBTABLE(pap) {
560 {0x79, "pap"},
561 {0x0479, "pap_029"},
562 {0x0479, "pap_AN"} /*Left in for compatibility*/
563 };
564
565 ILCID_POSIX_ELEMENT_ARRAY(0x0415, pl, pl_PL)
566 ILCID_POSIX_ELEMENT_ARRAY(0x0463, ps, ps_AF)
567
ILCID_POSIX_SUBTABLE(pt)568 ILCID_POSIX_SUBTABLE(pt) {
569 {0x16, "pt"},
570 {0x0416, "pt_BR"},
571 {0x0816, "pt_PT"}
572 };
573
ILCID_POSIX_SUBTABLE(qu)574 ILCID_POSIX_SUBTABLE(qu) {
575 {0x6b, "qu"},
576 {0x046b, "qu_BO"},
577 {0x086b, "qu_EC"},
578 {0x0C6b, "qu_PE"},
579 {0x046b, "quz_BO"},
580 {0x086b, "quz_EC"},
581 {0x0C6b, "quz_PE"}
582 };
583
ILCID_POSIX_SUBTABLE(quc)584 ILCID_POSIX_SUBTABLE(quc) {
585 {0x93, "quc"},
586 {0x0493, "quc_CO"},
587 /*
588 "quc_Latn_GT" is an exceptional case. Language ID of "quc"
589 is 0x93, but LCID of "quc_Latn_GT" is 0x486, which should be
590 under the group of "qut". "qut" is a retired ISO 639-3 language
591 code for West Central Quiche, and merged to "quc".
592 It looks Windows previously reserved "qut" for K'iche', but,
593 decided to use "quc" when adding a locale for K'iche' (Guatemala).
594
595 This data structure used here assumes language ID bits in
596 LCID is unique for alphabetic language code. But this is not true
597 for "quc_Latn_GT". If we don't have the data below, LCID look up
598 by alphabetic locale ID (POSIX) will fail. The same entry is found
599 under "qut" below, which is required for reverse look up.
600 */
601 {0x0486, "quc_Latn_GT"}
602 };
603
ILCID_POSIX_SUBTABLE(qut)604 ILCID_POSIX_SUBTABLE(qut) {
605 {0x86, "qut"},
606 {0x0486, "qut_GT"},
607 /*
608 See the note in "quc" above.
609 */
610 {0x0486, "quc_Latn_GT"}
611 };
612
613 ILCID_POSIX_ELEMENT_ARRAY(0x0417, rm, rm_CH)
614
ILCID_POSIX_SUBTABLE(ro)615 ILCID_POSIX_SUBTABLE(ro) {
616 {0x18, "ro"},
617 {0x0418, "ro_RO"},
618 {0x0818, "ro_MD"}
619 };
620
621 // TODO: This is almost certainly 'wrong'. 0 in Windows is a synonym for LOCALE_USER_DEFAULT.
622 // More likely this is a similar concept to the Windows 0x7f Invariant locale ""
623 // (Except that it's not invariant in ICU)
ILCID_POSIX_SUBTABLE(root)624 ILCID_POSIX_SUBTABLE(root) {
625 {0x00, "root"}
626 };
627
ILCID_POSIX_SUBTABLE(ru)628 ILCID_POSIX_SUBTABLE(ru) {
629 {0x19, "ru"},
630 {0x0419, "ru_RU"},
631 {0x0819, "ru_MD"}
632 };
633
634 ILCID_POSIX_ELEMENT_ARRAY(0x0487, rw, rw_RW)
635 ILCID_POSIX_ELEMENT_ARRAY(0x044f, sa, sa_IN)
636 ILCID_POSIX_ELEMENT_ARRAY(0x0485, sah,sah_RU)
637
ILCID_POSIX_SUBTABLE(sd)638 ILCID_POSIX_SUBTABLE(sd) {
639 {0x59, "sd"},
640 {0x0459, "sd_Deva_IN"},
641 {0x0459, "sd_IN"},
642 {0x0859, "sd_Arab_PK"},
643 {0x0859, "sd_PK"},
644 {0x7c59, "sd_Arab"}
645 };
646
ILCID_POSIX_SUBTABLE(se)647 ILCID_POSIX_SUBTABLE(se) {
648 {0x3b, "se"},
649 {0x0c3b, "se_FI"},
650 {0x043b, "se_NO"},
651 {0x083b, "se_SE"},
652 {0x783b, "sma"},
653 {0x183b, "sma_NO"},
654 {0x1c3b, "sma_SE"},
655 {0x7c3b, "smj"},
656 {0x703b, "smn"},
657 {0x743b, "sms"},
658 {0x103b, "smj_NO"},
659 {0x143b, "smj_SE"},
660 {0x243b, "smn_FI"},
661 {0x203b, "sms_FI"},
662 };
663
664 ILCID_POSIX_ELEMENT_ARRAY(0x045b, si, si_LK)
665 ILCID_POSIX_ELEMENT_ARRAY(0x041b, sk, sk_SK)
666 ILCID_POSIX_ELEMENT_ARRAY(0x0424, sl, sl_SI)
667
ILCID_POSIX_SUBTABLE(so)668 ILCID_POSIX_SUBTABLE(so) {
669 {0x77, "so"},
670 {0x0477, "so_SO"}
671 };
672
673 ILCID_POSIX_ELEMENT_ARRAY(0x041c, sq, sq_AL)
674 ILCID_POSIX_ELEMENT_ARRAY(0x0430, st, st_ZA)
675
ILCID_POSIX_SUBTABLE(sv)676 ILCID_POSIX_SUBTABLE(sv) {
677 {0x1d, "sv"},
678 {0x081d, "sv_FI"},
679 {0x041d, "sv_SE"}
680 };
681
682 ILCID_POSIX_ELEMENT_ARRAY(0x0441, sw, sw_KE)
683 ILCID_POSIX_ELEMENT_ARRAY(0x045A, syr, syr_SY)
684
ILCID_POSIX_SUBTABLE(ta)685 ILCID_POSIX_SUBTABLE(ta) {
686 {0x49, "ta"},
687 {0x0449, "ta_IN"},
688 {0x0849, "ta_LK"}
689 };
690
691 ILCID_POSIX_ELEMENT_ARRAY(0x044a, te, te_IN)
692
693 /* Cyrillic based by default */
ILCID_POSIX_SUBTABLE(tg)694 ILCID_POSIX_SUBTABLE(tg) {
695 {0x28, "tg"},
696 {0x7c28, "tg_Cyrl"},
697 {0x0428, "tg_Cyrl_TJ"}
698 };
699
700 ILCID_POSIX_ELEMENT_ARRAY(0x041e, th, th_TH)
701
ILCID_POSIX_SUBTABLE(ti)702 ILCID_POSIX_SUBTABLE(ti) {
703 {0x73, "ti"},
704 {0x0873, "ti_ER"},
705 {0x0473, "ti_ET"}
706 };
707
708 ILCID_POSIX_ELEMENT_ARRAY(0x0442, tk, tk_TM)
709
ILCID_POSIX_SUBTABLE(tn)710 ILCID_POSIX_SUBTABLE(tn) {
711 {0x32, "tn"},
712 {0x0832, "tn_BW"},
713 {0x0432, "tn_ZA"}
714 };
715
716 ILCID_POSIX_ELEMENT_ARRAY(0x041f, tr, tr_TR)
717 ILCID_POSIX_ELEMENT_ARRAY(0x0431, ts, ts_ZA)
718 ILCID_POSIX_ELEMENT_ARRAY(0x0444, tt, tt_RU)
719
/*
 * Tamazight (language ID 0x5f). The first row is the language-only entry.
 * NOTE(review): the last row maps "tmz" to 0x045f, the same LCID as
 * "tzm_Arab_MA"; "tmz" may be a transposition of "tzm" or a deliberate
 * legacy alias - confirm before changing.
 */
ILCID_POSIX_SUBTABLE(tzm) {
    {0x5f,   "tzm"},
    {0x7c5f, "tzm_Latn"},
    {0x085f, "tzm_Latn_DZ"},
    {0x105f, "tzm_Tfng_MA"},
    {0x045f, "tzm_Arab_MA"},
    {0x045f, "tmz"}
};
728
ILCID_POSIX_SUBTABLE(ug)729 ILCID_POSIX_SUBTABLE(ug) {
730 {0x80, "ug"},
731 {0x0480, "ug_CN"},
732 {0x0480, "ug_Arab_CN"}
733 };
734
735 ILCID_POSIX_ELEMENT_ARRAY(0x0422, uk, uk_UA)
736
ILCID_POSIX_SUBTABLE(ur)737 ILCID_POSIX_SUBTABLE(ur) {
738 {0x20, "ur"},
739 {0x0820, "ur_IN"},
740 {0x0420, "ur_PK"}
741 };
742
ILCID_POSIX_SUBTABLE(uz)743 ILCID_POSIX_SUBTABLE(uz) {
744 {0x43, "uz"},
745 {0x0843, "uz_Cyrl_UZ"}, /* Cyrillic based */
746 {0x7843, "uz_Cyrl"}, /* Cyrillic based */
747 {0x0843, "uz_UZ"}, /* Cyrillic based */
748 {0x0443, "uz_Latn_UZ"}, /* Latin based */
749 {0x7c43, "uz_Latn"} /* Latin based */
750 };
751
ILCID_POSIX_SUBTABLE(ve)752 ILCID_POSIX_SUBTABLE(ve) { /* TODO: Verify the country */
753 {0x33, "ve"},
754 {0x0433, "ve_ZA"},
755 {0x0433, "ven_ZA"}
756 };
757
758 ILCID_POSIX_ELEMENT_ARRAY(0x042a, vi, vi_VN)
759 ILCID_POSIX_ELEMENT_ARRAY(0x0488, wo, wo_SN)
760 ILCID_POSIX_ELEMENT_ARRAY(0x0434, xh, xh_ZA)
761
ILCID_POSIX_SUBTABLE(yi)762 ILCID_POSIX_SUBTABLE(yi) {
763 {0x003d, "yi"},
764 {0x043d, "yi_001"}
765 };
766
767 ILCID_POSIX_ELEMENT_ARRAY(0x046a, yo, yo_NG)
768
// Windows & ICU tend to use different names for some of these
770 // TODO: Windows probably does not need all of these entries, but I don't know how the precedence works.
/*
 * Chinese (language ID 0x04). Unlike most tables, the first row pairs the
 * bare language LCID 0x0004 with "zh_Hans", and plain "zh" is 0x7804.
 * Several POSIX IDs appear more than once ("zh_Hant_TW" and "zh_TW" map to
 * both 0x0404 and 0x30404). getHostID() scans rows in order and returns on
 * the first exact match, so for POSIX -> LCID the earlier row is the one used.
 * LCIDs above 0xffff (0x2xxxx, 0x3xxxx) are the sorting variants described in
 * the row comments and @collation keywords.
 */
ILCID_POSIX_SUBTABLE(zh) {
    {0x0004, "zh_Hans"},
    {0x7804, "zh"},
    {0x0804, "zh_CN"},
    {0x0804, "zh_Hans_CN"},
    {0x0c04, "zh_Hant_HK"},
    {0x0c04, "zh_HK"},
    {0x1404, "zh_Hant_MO"},
    {0x1404, "zh_MO"},
    {0x1004, "zh_Hans_SG"},
    {0x1004, "zh_SG"},
    {0x0404, "zh_Hant_TW"},
    {0x7c04, "zh_Hant"},
    {0x0404, "zh_TW"},
    {0x30404,"zh_Hant_TW"},     /* Bopomofo order */
    {0x30404,"zh_TW"},          /* Bopomofo order */
    {0x20004,"zh@collation=stroke"},
    {0x20404,"zh_Hant@collation=stroke"},
    {0x20404,"zh_Hant_TW@collation=stroke"},
    {0x20404,"zh_TW@collation=stroke"},
    {0x20804,"zh_Hans@collation=stroke"},
    {0x20804,"zh_Hans_CN@collation=stroke"},
    {0x20804,"zh_CN@collation=stroke"}
    // TODO: Alternate collations for other LCIDs are missing, eg: 0x50804
};
796
797 ILCID_POSIX_ELEMENT_ARRAY(0x0435, zu, zu_ZA)
798
799 /* This must be static and grouped by LCID. */
800 constexpr ILcidPosixMap gPosixIDmap[] = {
801 ILCID_POSIX_MAP(af), /* af Afrikaans 0x36 */
802 ILCID_POSIX_MAP(am), /* am Amharic 0x5e */
803 ILCID_POSIX_MAP(ar), /* ar Arabic 0x01 */
804 ILCID_POSIX_MAP(arn), /* arn Araucanian/Mapudungun 0x7a */
805 ILCID_POSIX_MAP(as), /* as Assamese 0x4d */
806 ILCID_POSIX_MAP(az), /* az Azerbaijani 0x2c */
807 ILCID_POSIX_MAP(ba), /* ba Bashkir 0x6d */
808 ILCID_POSIX_MAP(be), /* be Belarusian 0x23 */
809 /* ILCID_POSIX_MAP(ber), ber Berber/Tamazight 0x5f */
810 ILCID_POSIX_MAP(bg), /* bg Bulgarian 0x02 */
811 ILCID_POSIX_MAP(bin), /* bin Edo 0x66 */
812 ILCID_POSIX_MAP(bn), /* bn Bengali; Bangla 0x45 */
813 ILCID_POSIX_MAP(bo), /* bo Tibetan 0x51 */
814 ILCID_POSIX_MAP(br), /* br Breton 0x7e */
815 ILCID_POSIX_MAP(ca), /* ca Catalan 0x03 */
816 ILCID_POSIX_MAP(chr), /* chr Cherokee 0x5c */
817 ILCID_POSIX_MAP(ckb), /* ckb Sorani (Central Kurdish) 0x92 */
818 ILCID_POSIX_MAP(co), /* co Corsican 0x83 */
819 ILCID_POSIX_MAP(cs), /* cs Czech 0x05 */
820 ILCID_POSIX_MAP(cy), /* cy Welsh 0x52 */
821 ILCID_POSIX_MAP(da), /* da Danish 0x06 */
822 ILCID_POSIX_MAP(de), /* de German 0x07 */
823 ILCID_POSIX_MAP(dv), /* dv Divehi 0x65 */
824 ILCID_POSIX_MAP(el), /* el Greek 0x08 */
825 ILCID_POSIX_MAP(en), /* en English 0x09 */
826 ILCID_POSIX_MAP(en_US_POSIX), /* invariant 0x7f */
827 ILCID_POSIX_MAP(es), /* es Spanish 0x0a */
828 ILCID_POSIX_MAP(et), /* et Estonian 0x25 */
829 ILCID_POSIX_MAP(eu), /* eu Basque 0x2d */
830 ILCID_POSIX_MAP(fa), /* fa Persian/Farsi 0x29 */
831 ILCID_POSIX_MAP(fa_AF), /* fa Persian/Dari 0x8c */
832 ILCID_POSIX_MAP(ff), /* ff Fula 0x67 */
833 ILCID_POSIX_MAP(fi), /* fi Finnish 0x0b */
834 ILCID_POSIX_MAP(fil), /* fil Filipino 0x64 */
835 ILCID_POSIX_MAP(fo), /* fo Faroese 0x38 */
836 ILCID_POSIX_MAP(fr), /* fr French 0x0c */
837 ILCID_POSIX_MAP(fuv), /* fuv Fulfulde - Nigeria 0x67 */
838 ILCID_POSIX_MAP(fy), /* fy Frisian 0x62 */
839 ILCID_POSIX_MAP(ga), /* * Gaelic (Ireland,Scotland) 0x3c */
840 ILCID_POSIX_MAP(gd), /* gd Gaelic (United Kingdom) 0x91 */
841 ILCID_POSIX_MAP(gl), /* gl Galician 0x56 */
842 ILCID_POSIX_MAP(gn), /* gn Guarani 0x74 */
843 ILCID_POSIX_MAP(gsw), /* gsw Alemanic/Alsatian/Swiss German 0x84 */
844 ILCID_POSIX_MAP(gu), /* gu Gujarati 0x47 */
845 ILCID_POSIX_MAP(ha), /* ha Hausa 0x68 */
846 ILCID_POSIX_MAP(haw), /* haw Hawaiian 0x75 */
847 ILCID_POSIX_MAP(he), /* he Hebrew (formerly iw) 0x0d */
848 ILCID_POSIX_MAP(hi), /* hi Hindi 0x39 */
849 ILCID_POSIX_MAP(hr), /* * Croatian and others 0x1a */
850 ILCID_POSIX_MAP(hsb), /* hsb Upper Sorbian 0x2e */
851 ILCID_POSIX_MAP(hu), /* hu Hungarian 0x0e */
852 ILCID_POSIX_MAP(hy), /* hy Armenian 0x2b */
853 ILCID_POSIX_MAP(ibb), /* ibb Ibibio - Nigeria 0x69 */
854 ILCID_POSIX_MAP(id), /* id Indonesian (formerly in) 0x21 */
855 ILCID_POSIX_MAP(ig), /* ig Igbo 0x70 */
856 ILCID_POSIX_MAP(ii), /* ii Sichuan Yi 0x78 */
857 ILCID_POSIX_MAP(is), /* is Icelandic 0x0f */
858 ILCID_POSIX_MAP(it), /* it Italian 0x10 */
859 ILCID_POSIX_MAP(iu), /* iu Inuktitut 0x5d */
860 ILCID_POSIX_MAP(iw), /* iw Hebrew 0x0d */
861 ILCID_POSIX_MAP(ja), /* ja Japanese 0x11 */
862 ILCID_POSIX_MAP(ka), /* ka Georgian 0x37 */
863 ILCID_POSIX_MAP(kk), /* kk Kazakh 0x3f */
864 ILCID_POSIX_MAP(kl), /* kl Kalaallisut 0x6f */
865 ILCID_POSIX_MAP(km), /* km Khmer 0x53 */
866 ILCID_POSIX_MAP(kn), /* kn Kannada 0x4b */
867 ILCID_POSIX_MAP(ko), /* ko Korean 0x12 */
868 ILCID_POSIX_MAP(kok), /* kok Konkani 0x57 */
869 ILCID_POSIX_MAP(kr), /* kr Kanuri 0x71 */
870 ILCID_POSIX_MAP(ks), /* ks Kashmiri 0x60 */
871 ILCID_POSIX_MAP(ky), /* ky Kyrgyz 0x40 */
872 ILCID_POSIX_MAP(lb), /* lb Luxembourgish 0x6e */
873 ILCID_POSIX_MAP(la), /* la Latin 0x76 */
874 ILCID_POSIX_MAP(lo), /* lo Lao 0x54 */
875 ILCID_POSIX_MAP(lt), /* lt Lithuanian 0x27 */
876 ILCID_POSIX_MAP(lv), /* lv Latvian, Lettish 0x26 */
877 ILCID_POSIX_MAP(mi), /* mi Maori 0x81 */
878 ILCID_POSIX_MAP(mk), /* mk Macedonian 0x2f */
879 ILCID_POSIX_MAP(ml), /* ml Malayalam 0x4c */
880 ILCID_POSIX_MAP(mn), /* mn Mongolian 0x50 */
881 ILCID_POSIX_MAP(mni), /* mni Manipuri 0x58 */
882 ILCID_POSIX_MAP(moh), /* moh Mohawk 0x7c */
883 ILCID_POSIX_MAP(mr), /* mr Marathi 0x4e */
884 ILCID_POSIX_MAP(ms), /* ms Malay 0x3e */
885 ILCID_POSIX_MAP(mt), /* mt Maltese 0x3a */
886 ILCID_POSIX_MAP(my), /* my Burmese 0x55 */
887 /* ILCID_POSIX_MAP(nb), // no Norwegian 0x14 */
888 ILCID_POSIX_MAP(ne), /* ne Nepali 0x61 */
889 ILCID_POSIX_MAP(nl), /* nl Dutch 0x13 */
890 /* ILCID_POSIX_MAP(nn), // no Norwegian 0x14 */
891 ILCID_POSIX_MAP(no), /* * Norwegian 0x14 */
892 ILCID_POSIX_MAP(nso), /* nso Sotho, Northern (Sepedi dialect) 0x6c */
893 ILCID_POSIX_MAP(oc), /* oc Occitan 0x82 */
894 ILCID_POSIX_MAP(om), /* om Oromo 0x72 */
895 ILCID_POSIX_MAP(or_IN), /* or Oriya 0x48 */
896 ILCID_POSIX_MAP(pa), /* pa Punjabi 0x46 */
897 ILCID_POSIX_MAP(pap), /* pap Papiamentu 0x79 */
898 ILCID_POSIX_MAP(pl), /* pl Polish 0x15 */
899 ILCID_POSIX_MAP(ps), /* ps Pashto 0x63 */
900 ILCID_POSIX_MAP(pt), /* pt Portuguese 0x16 */
901 ILCID_POSIX_MAP(qu), /* qu Quechua 0x6B */
902 ILCID_POSIX_MAP(quc), /* quc K'iche 0x93 */
903 ILCID_POSIX_MAP(qut), /* qut K'iche 0x86 */
904 ILCID_POSIX_MAP(rm), /* rm Raeto-Romance/Romansh 0x17 */
905 ILCID_POSIX_MAP(ro), /* ro Romanian 0x18 */
906 ILCID_POSIX_MAP(root), /* root 0x00 */
907 ILCID_POSIX_MAP(ru), /* ru Russian 0x19 */
908 ILCID_POSIX_MAP(rw), /* rw Kinyarwanda 0x87 */
909 ILCID_POSIX_MAP(sa), /* sa Sanskrit 0x4f */
910 ILCID_POSIX_MAP(sah), /* sah Yakut 0x85 */
911 ILCID_POSIX_MAP(sd), /* sd Sindhi 0x59 */
912 ILCID_POSIX_MAP(se), /* se Sami 0x3b */
913 /* ILCID_POSIX_MAP(sh), // sh Serbo-Croatian 0x1a */
914 ILCID_POSIX_MAP(si), /* si Sinhalese 0x5b */
915 ILCID_POSIX_MAP(sk), /* sk Slovak 0x1b */
916 ILCID_POSIX_MAP(sl), /* sl Slovenian 0x24 */
917 ILCID_POSIX_MAP(so), /* so Somali 0x77 */
918 ILCID_POSIX_MAP(sq), /* sq Albanian 0x1c */
919 /* ILCID_POSIX_MAP(sr), // sr Serbian 0x1a */
920 ILCID_POSIX_MAP(st), /* st Sutu 0x30 */
921 ILCID_POSIX_MAP(sv), /* sv Swedish 0x1d */
922 ILCID_POSIX_MAP(sw), /* sw Swahili 0x41 */
923 ILCID_POSIX_MAP(syr), /* syr Syriac 0x5A */
924 ILCID_POSIX_MAP(ta), /* ta Tamil 0x49 */
925 ILCID_POSIX_MAP(te), /* te Telugu 0x4a */
926 ILCID_POSIX_MAP(tg), /* tg Tajik 0x28 */
927 ILCID_POSIX_MAP(th), /* th Thai 0x1e */
928 ILCID_POSIX_MAP(ti), /* ti Tigrigna 0x73 */
929 ILCID_POSIX_MAP(tk), /* tk Turkmen 0x42 */
930 ILCID_POSIX_MAP(tn), /* tn Tswana 0x32 */
931 ILCID_POSIX_MAP(tr), /* tr Turkish 0x1f */
932 ILCID_POSIX_MAP(ts), /* ts Tsonga 0x31 */
933 ILCID_POSIX_MAP(tt), /* tt Tatar 0x44 */
934 ILCID_POSIX_MAP(tzm), /* tzm Tamazight 0x5f */
935 ILCID_POSIX_MAP(ug), /* ug Uighur 0x80 */
936 ILCID_POSIX_MAP(uk), /* uk Ukrainian 0x22 */
937 ILCID_POSIX_MAP(ur), /* ur Urdu 0x20 */
938 ILCID_POSIX_MAP(uz), /* uz Uzbek 0x43 */
939 ILCID_POSIX_MAP(ve), /* ve Venda 0x33 */
940 ILCID_POSIX_MAP(vi), /* vi Vietnamese 0x2a */
941 ILCID_POSIX_MAP(wo), /* wo Wolof 0x88 */
942 ILCID_POSIX_MAP(xh), /* xh Xhosa 0x34 */
943 ILCID_POSIX_MAP(yi), /* yi Yiddish 0x3d */
944 ILCID_POSIX_MAP(yo), /* yo Yoruba 0x6a */
945 ILCID_POSIX_MAP(zh), /* zh Chinese 0x04 */
946 ILCID_POSIX_MAP(zu), /* zu Zulu 0x35 */
947 };
948
// Number of per-language entries in gPosixIDmap. It bounds both the table scan
// in uprv_convertToPosix() and the binary/linear searches in uprv_convertToLCID().
constexpr uint32_t gLocaleCount = UPRV_LENGTHOF(gPosixIDmap);
950
/**
 * Measures the common prefix of two NUL-terminated IDs.
 * Internal helper: getHostID() uses it to rank candidate POSIX IDs.
 *
 * @param id1 first NUL-terminated string.
 * @param id2 second NUL-terminated string.
 * @return the number of leading characters that id1 and id2 share.
 */
int32_t
idCmp(const char* id1, const char* id2)
{
    int32_t shared = 0;
    // Stop at the end of id1 or at the first mismatch (which also covers the
    // end of id2, because its NUL cannot equal a non-NUL character of id1).
    while (id1[shared] != 0 && id1[shared] == id2[shared]) {
        ++shared;
    }
    return shared;
}
967
968 /**
969 * Searches for a Windows LCID
970 *
971 * @param posixID the Posix style locale id.
972 * @param status gets set to U_ILLEGAL_ARGUMENT_ERROR when the Posix ID has
973 * no equivalent Windows LCID.
974 * @return the LCID
975 */
976 uint32_t
getHostID(const ILcidPosixMap * this_0,const char * posixID,UErrorCode & status)977 getHostID(const ILcidPosixMap *this_0, const char* posixID, UErrorCode& status)
978 {
979 if (U_FAILURE(status)) { return locmap_root->hostID; }
980 int32_t bestIdx = 0;
981 int32_t bestIdxDiff = 0;
982 int32_t posixIDlen = (int32_t)uprv_strlen(posixID);
983 uint32_t idx;
984
985 for (idx = 0; idx < this_0->numRegions; idx++ ) {
986 int32_t sameChars = idCmp(posixID, this_0->regionMaps[idx].posixID);
987 if (sameChars > bestIdxDiff && this_0->regionMaps[idx].posixID[sameChars] == 0) {
988 if (posixIDlen == sameChars) {
989 /* Exact match */
990 return this_0->regionMaps[idx].hostID;
991 }
992 bestIdxDiff = sameChars;
993 bestIdx = idx;
994 }
995 }
996 /* We asked for something unusual, like en_ZZ, and we try to return the number for the same language. */
997 /* We also have to make sure that sid and si and similar string subsets don't match. */
998 if ((posixID[bestIdxDiff] == '_' || posixID[bestIdxDiff] == '@')
999 && this_0->regionMaps[bestIdx].posixID[bestIdxDiff] == 0)
1000 {
1001 status = U_USING_FALLBACK_WARNING;
1002 return this_0->regionMaps[bestIdx].hostID;
1003 }
1004
1005 /*no match found */
1006 status = U_ILLEGAL_ARGUMENT_ERROR;
1007 return locmap_root->hostID;
1008 }
1009
1010 const char*
getPosixID(const ILcidPosixMap * this_0,uint32_t hostID)1011 getPosixID(const ILcidPosixMap *this_0, uint32_t hostID)
1012 {
1013 uint32_t i;
1014 for (i = 0; i < this_0->numRegions; i++)
1015 {
1016 if (this_0->regionMaps[i].hostID == hostID)
1017 {
1018 return this_0->regionMaps[i].posixID;
1019 }
1020 }
1021
1022 /* If you get here, then no matching region was found,
1023 so return the language id with the wild card region. */
1024 return this_0->regionMaps[0].posixID;
1025 }
1026
1027 /*
1028 //////////////////////////////////////
1029 //
1030 // LCID --> POSIX
1031 //
1032 /////////////////////////////////////
1033 */
1034 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
/*
 * Rewrites, in place, language tags that need to be changed:
 * quz -> qu
 * prs -> fa
 * Whatever follows the three-letter tag (for example "_PE") is preserved and
 * shifted left by one character.
 *
 * buffer: NUL-terminated locale name, modified in place.
 * len:    length hint from the caller; nothing is changed when len < 3.
 */
void FIX_LANGUAGE_ID_TAG(char* buffer, int32_t len) {
    if (len < 3) {
        return;
    }

    const bool isQuz = buffer[0] == 'q' && buffer[1] == 'u' && buffer[2] == 'z';
    const bool isPrs = buffer[0] == 'p' && buffer[1] == 'r' && buffer[2] == 's';
    if (!isQuz && !isPrs) {
        return;
    }

    if (isPrs) {
        buffer[0] = 'f';
        buffer[1] = 'a';
    }

    // Drop the third letter by sliding the rest of the string (including its
    // terminating NUL, hence the +1) one position to the left. Source and
    // destination overlap, so a strcat/strcpy here would be undefined
    // behavior; memmove is specified to handle overlapping ranges.
    uprv_memmove(buffer + 2, buffer + 3, uprv_strlen(buffer + 3) + 1);
}
1051 #endif
1052
1053 } // namespace
1054
/**
 * Converts a Windows LCID to a POSIX-style locale ID and copies it into a
 * caller-supplied buffer.
 *
 * When the Windows LCID mapping API is compiled in, the name reported by
 * LCIDToLocaleName() is preferred. The hard-coded gPosixIDmap table is
 * consulted when that API is compiled out or yields no usable name, for
 * primary language 0x92, and when the Windows name carries a '_' sort variant.
 *
 * Note that *status is not tested for failure on entry.
 *
 * @param hostid          the Windows LCID to convert.
 * @param posixID         destination buffer for the locale ID.
 * @param posixIDCapacity capacity of posixID, in chars.
 * @param status          set to U_STRING_NOT_TERMINATED_WARNING when the result
 *                        fills the buffer exactly (no NUL is written), to
 *                        U_BUFFER_OVERFLOW_ERROR when the result had to be
 *                        truncated, and to U_ILLEGAL_ARGUMENT_ERROR when no
 *                        locale ID was found. An incoming
 *                        U_STRING_NOT_TERMINATED_WARNING is reset to
 *                        U_ZERO_ERROR when the result fits with its NUL.
 * @return the full length of the locale ID (not counting the NUL), even when
 *         it did not fit into the buffer; 0 when no locale ID was found.
 */
U_CAPI int32_t
uprv_convertToPosix(uint32_t hostid, char *posixID, int32_t posixIDCapacity, UErrorCode* status)
{
    uint16_t langID;
    uint32_t localeIndex;
    // Whether the hard-coded table must be searched; cleared only when the
    // Windows API supplies a name without a sort variant.
    UBool bLookup = true;
    const char *pPosixID = nullptr;

#if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
    static_assert(ULOC_FULLNAME_CAPACITY > LOCALE_NAME_MAX_LENGTH, "Windows locale names have smaller length than ICU locale names.");

    // Declared at function scope because pPosixID may point into it until the
    // final copy at the bottom of the function.
    char locName[LOCALE_NAME_MAX_LENGTH] = {};

    // Note: Windows primary lang ID 0x92 in LCID is used for Central Kurdish and
    // GetLocaleInfo() maps such LCID to "ku". However, CLDR uses "ku" for
    // Northern Kurdish and "ckb" for Central Kurdish. For this reason, we cannot
    // use the Windows API to resolve locale ID for this specific case.
    if ((hostid & 0x3FF) != 0x92) {
        int32_t tmpLen = 0;
        char16_t windowsLocaleName[LOCALE_NAME_MAX_LENGTH] = {};

        // Note: LOCALE_ALLOW_NEUTRAL_NAMES was enabled in Windows7+, prior versions did not handle neutral (no-region) locale names.
        tmpLen = LCIDToLocaleName(hostid, (PWSTR)windowsLocaleName, UPRV_LENGTHOF(windowsLocaleName), LOCALE_ALLOW_NEUTRAL_NAMES);
        // NOTE(review): per the Win32 documentation the returned count includes
        // the terminating NUL, so "> 1" would mean a non-empty name -- confirm.
        if (tmpLen > 1) {
            int32_t i = 0;
            // Only need to look up in table if have _, eg for de-de_phoneb type alternate sort.
            bLookup = false;
            // Narrow the UTF-16 name to char while translating separators.
            for (i = 0; i < UPRV_LENGTHOF(locName); i++)
            {
                locName[i] = (char)(windowsLocaleName[i]);

                // Windows locale name may contain sorting variant, such as "es-ES_tradnl".
                // In such cases, we need special mapping data found in the hardcoded table
                // in this source file.
                if (windowsLocaleName[i] == L'_')
                {
                    // Keep the base locale, without variant
                    // TODO: Should these be mapped from _phoneb to @collation=phonebook, etc.?
                    locName[i] = '\0';
                    tmpLen = i;
                    bLookup = true;
                    break;
                }
                else if (windowsLocaleName[i] == L'-')
                {
                    // Windows names use -, ICU uses _
                    locName[i] = '_';
                }
                else if (windowsLocaleName[i] == L'\0')
                {
                    // No point in doing more work than necessary
                    break;
                }
            }
            // TODO: Need to understand this better, why isn't it an alias?
            FIX_LANGUAGE_ID_TAG(locName, tmpLen);
            pPosixID = locName;
        }
    }
#endif

    if (bLookup) {
        const char *pCandidate = nullptr;
        langID = LANGUAGE_LCID(hostid);

        // Find the language map whose first entry carries this language ID,
        // then resolve the full LCID within that map.
        for (localeIndex = 0; localeIndex < gLocaleCount; localeIndex++) {
            if (langID == gPosixIDmap[localeIndex].regionMaps->hostID) {
                pCandidate = getPosixID(&gPosixIDmap[localeIndex], hostid);
                break;
            }
        }

        /* On Windows, when locale name has a variant, we still look up the hardcoded table.
           If a match in the hardcoded table is longer than the Windows locale name without
           variant, we use the one as the result */
        if (pCandidate && (pPosixID == nullptr || uprv_strlen(pCandidate) > uprv_strlen(pPosixID))) {
            pPosixID = pCandidate;
        }
    }

    if (pPosixID) {
        int32_t resLen = static_cast<int32_t>(uprv_strlen(pPosixID));
        // Copy as much as fits; the status below tells the caller whether the
        // result is complete and NUL-terminated.
        int32_t copyLen = resLen <= posixIDCapacity ? resLen : posixIDCapacity;
        uprv_memcpy(posixID, pPosixID, copyLen);
        if (resLen < posixIDCapacity) {
            // Room for the terminator: any earlier "not terminated" warning no
            // longer applies to this result.
            posixID[resLen] = 0;
            if (*status == U_STRING_NOT_TERMINATED_WARNING) {
                *status = U_ZERO_ERROR;
            }
        } else if (resLen == posixIDCapacity) {
            // All characters fit, but there is no space left for the NUL.
            *status = U_STRING_NOT_TERMINATED_WARNING;
        } else {
            // Output was truncated; the returned length says how much is needed.
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
        return resLen;
    }

    /* no match found */
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
1156
1157 /*
1158 //////////////////////////////////////
1159 //
1160 // POSIX --> LCID
1161 // This should only be called from uloc_getLCID.
1162 // The locale ID must be in canonical form.
1163 //
1164 /////////////////////////////////////
1165 */
1166 U_CAPI uint32_t
uprv_convertToLCIDPlatform(const char * localeID,UErrorCode * status)1167 uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
1168 {
1169 if (U_FAILURE(*status)) {
1170 return 0;
1171 }
1172
1173 // The purpose of this function is to leverage the Windows platform name->lcid
1174 // conversion functionality when available.
1175 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
1176 int32_t len;
1177 icu::CharString baseName;
1178 const char * mylocaleID = localeID;
1179
1180 // Check any for keywords.
1181 if (uprv_strchr(localeID, '@'))
1182 {
1183 icu::CharString collVal = ulocimp_getKeywordValue(localeID, "collation", *status);
1184 if (U_SUCCESS(*status) && !collVal.isEmpty())
1185 {
1186 // If it contains the keyword collation, return 0 so that the LCID lookup table will be used.
1187 return 0;
1188 }
1189 else
1190 {
1191 // If the locale ID contains keywords other than collation, just use the base name.
1192 baseName = ulocimp_getBaseName(localeID, *status);
1193 if (U_SUCCESS(*status) && !baseName.isEmpty())
1194 {
1195 mylocaleID = baseName.data();
1196 }
1197 }
1198 }
1199
1200 // this will change it from de_DE@collation=phonebook to de-DE-u-co-phonebk form
1201 icu::CharString asciiBCP47Tag = ulocimp_toLanguageTag(mylocaleID, false, *status);
1202
1203 if (U_SUCCESS(*status))
1204 {
1205 // Need it to be UTF-16, not 8-bit
1206 wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {};
1207 int32_t i;
1208 for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++)
1209 {
1210 if (asciiBCP47Tag[i] == '\0')
1211 {
1212 break;
1213 }
1214 else
1215 {
1216 // Copy the character
1217 bcp47Tag[i] = static_cast<wchar_t>(asciiBCP47Tag[i]);
1218 }
1219 }
1220
1221 if (i < (UPRV_LENGTHOF(bcp47Tag) - 1))
1222 {
1223 // Ensure it's null terminated
1224 bcp47Tag[i] = L'\0';
1225 LCID lcid = LocaleNameToLCID(bcp47Tag, LOCALE_ALLOW_NEUTRAL_NAMES);
1226 if (lcid > 0)
1227 {
1228 // Found LCID from windows, return that one, unless its completely ambiguous
1229 // LOCALE_USER_DEFAULT and transients are OK because they will round trip
1230 // for this process.
1231 if (lcid != LOCALE_CUSTOM_UNSPECIFIED)
1232 {
1233 return lcid;
1234 }
1235 }
1236 }
1237 }
1238 #else
1239 (void) localeID; // Suppress unused variable warning.
1240 #endif
1241
1242 // Nothing found, or not implemented.
1243 return 0;
1244 }
1245
1246 U_CAPI uint32_t
uprv_convertToLCID(const char * langID,const char * posixID,UErrorCode * status)1247 uprv_convertToLCID(const char *langID, const char* posixID, UErrorCode* status)
1248 {
1249 if (U_FAILURE(*status) ||
1250 langID == nullptr ||
1251 posixID == nullptr ||
1252 uprv_strlen(langID) < 2 ||
1253 uprv_strlen(posixID) < 2) {
1254 return locmap_root->hostID;
1255 }
1256
1257 // This function does the table lookup when native platform name->lcid conversion isn't available,
1258 // or for locales that don't follow patterns the platform expects.
1259 uint32_t low = 0;
1260 uint32_t high = gLocaleCount;
1261 uint32_t mid;
1262 uint32_t oldmid = 0;
1263 int32_t compVal;
1264
1265 uint32_t value = 0;
1266 uint32_t fallbackValue = (uint32_t)-1;
1267 UErrorCode myStatus;
1268 uint32_t idx;
1269
1270 /*Binary search for the map entry for normal cases */
1271
1272 while (high > low) /*binary search*/{
1273
1274 mid = (high+low) >> 1; /*Finds median*/
1275
1276 if (mid == oldmid)
1277 break;
1278
1279 compVal = uprv_strcmp(langID, gPosixIDmap[mid].regionMaps->posixID);
1280 if (compVal < 0){
1281 high = mid;
1282 }
1283 else if (compVal > 0){
1284 low = mid;
1285 }
1286 else /*we found it*/{
1287 return getHostID(&gPosixIDmap[mid], posixID, *status);
1288 }
1289 oldmid = mid;
1290 }
1291
1292 /*
1293 * Sometimes we can't do a binary search on posixID because some LCIDs
1294 * go to different locales. We hit one of those special cases.
1295 */
1296 for (idx = 0; idx < gLocaleCount; idx++ ) {
1297 myStatus = U_ZERO_ERROR;
1298 value = getHostID(&gPosixIDmap[idx], posixID, myStatus);
1299 if (myStatus == U_ZERO_ERROR) {
1300 return value;
1301 }
1302 else if (myStatus == U_USING_FALLBACK_WARNING) {
1303 fallbackValue = value;
1304 }
1305 }
1306
1307 if (fallbackValue != (uint32_t)-1) {
1308 *status = U_USING_FALLBACK_WARNING;
1309 return fallbackValue;
1310 }
1311
1312 /* no match found */
1313 *status = U_ILLEGAL_ARGUMENT_ERROR;
1314 return locmap_root->hostID; /* return international (root) */
1315 }
1316