1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File locid.cpp
10 *
11 * Created by: Richard Gillam
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 02/11/97 aliu Changed gLocPath to fgDataDirectory and added
17 * methods to get and set it.
18 * 04/02/97 aliu Made operator!= inline; fixed return value
19 * of getName().
20 * 04/15/97 aliu Cleanup for AIX/Win32.
21 * 04/24/97 aliu Numerous changes per code review.
22 * 08/18/98 stephen Changed getDisplayName()
23 * Added SIMPLIFIED_CHINESE, TRADITIONAL_CHINESE
24 * Added getISOCountries(), getISOLanguages(),
25 * getLanguagesForCountry()
26 * 03/16/99 bertrand rehaul.
27 * 07/21/99 stephen Added U_CFUNC setDefault
28 * 11/09/99 weiv Added const char * getName() const;
29 * 04/12/00 srl removing unicodestring api's and cached hash code
30 * 08/10/01 grhoten Change the static Locales to accessor functions
31 ******************************************************************************
32 */
33
34 #include <utility>
35
36 #include "unicode/bytestream.h"
37 #include "unicode/locid.h"
38 #include "unicode/localebuilder.h"
39 #include "unicode/strenum.h"
40 #include "unicode/stringpiece.h"
41 #include "unicode/uloc.h"
42 #include "unicode/ures.h"
43
44 #include "bytesinkutil.h"
45 #include "charstr.h"
46 #include "charstrmap.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "mutex.h"
50 #include "putilimp.h"
51 #include "uassert.h"
52 #include "ucln_cmn.h"
53 #include "uhash.h"
54 #include "ulocimp.h"
55 #include "umutex.h"
56 #include "uniquecharstr.h"
57 #include "ustr_imp.h"
58 #include "uvector.h"
59
60 U_CDECL_BEGIN
61 static UBool U_CALLCONV locale_cleanup(void);
62 U_CDECL_END
63
64 U_NAMESPACE_BEGIN
65
// Array of the predefined locales, indexed by ELocalePos. Allocated by
// locale_init() and released by locale_cleanup().
static Locale *gLocaleCache = NULL;
// Init-once state that goes with gLocaleCache; reset by locale_cleanup().
static UInitOnce gLocaleCacheInitOnce = U_INITONCE_INITIALIZER;

// gDefaultLocaleMutex protects all access to gDefaultLocalesHashT and gDefaultLocale.
static UMutex gDefaultLocaleMutex;
// Table of Locale objects keyed by locale name. Created on first use by
// locale_set_default_internal(), which also installs deleteLocale as the value deleter.
static UHashtable *gDefaultLocalesHashT = NULL;
// The current default locale; always one of the values stored in gDefaultLocalesHashT.
static Locale *gDefaultLocale = NULL;
73
74 /**
75 * \def ULOC_STRING_LIMIT
76 * strings beyond this value crash in CharString
77 */
78 #define ULOC_STRING_LIMIT 357913941
79
80 U_NAMESPACE_END
81
// Index of each predefined locale inside gLocaleCache (see locale_init()).
// The enumerator values are used directly as array subscripts.
typedef enum ELocalePos {
    // Locales built from a language code only.
    eENGLISH,
    eFRENCH,
    eGERMAN,
    eITALIAN,
    eJAPANESE,
    eKOREAN,
    eCHINESE,

    // Locales built from a language code and a country code.
    eFRANCE,
    eGERMANY,
    eITALY,
    eJAPAN,
    eKOREA,
    eCHINA,      /* Alias for PRC */
    eTAIWAN,
    eUK,
    eUS,
    eCANADA,
    eCANADA_FRENCH,
    // The locale built from the empty string.
    eROOT,


    //eDEFAULT,
    // Number of entries above; used as the size of the gLocaleCache array.
    eMAX_LOCALES
} ELocalePos;
108
109 U_CDECL_BEGIN
110 //
111 // Deleter function for Locales owned by the default Locale hash table/
112 //
113 static void U_CALLCONV
deleteLocale(void * obj)114 deleteLocale(void *obj) {
115 delete (icu::Locale *) obj;
116 }
117
locale_cleanup(void)118 static UBool U_CALLCONV locale_cleanup(void)
119 {
120 U_NAMESPACE_USE
121
122 delete [] gLocaleCache;
123 gLocaleCache = NULL;
124 gLocaleCacheInitOnce.reset();
125
126 if (gDefaultLocalesHashT) {
127 uhash_close(gDefaultLocalesHashT); // Automatically deletes all elements, using deleter func.
128 gDefaultLocalesHashT = NULL;
129 }
130 gDefaultLocale = NULL;
131 return TRUE;
132 }
133
134
locale_init(UErrorCode & status)135 static void U_CALLCONV locale_init(UErrorCode &status) {
136 U_NAMESPACE_USE
137
138 U_ASSERT(gLocaleCache == NULL);
139 gLocaleCache = new Locale[(int)eMAX_LOCALES];
140 if (gLocaleCache == NULL) {
141 status = U_MEMORY_ALLOCATION_ERROR;
142 return;
143 }
144 ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup);
145 gLocaleCache[eROOT] = Locale("");
146 gLocaleCache[eENGLISH] = Locale("en");
147 gLocaleCache[eFRENCH] = Locale("fr");
148 gLocaleCache[eGERMAN] = Locale("de");
149 gLocaleCache[eITALIAN] = Locale("it");
150 gLocaleCache[eJAPANESE] = Locale("ja");
151 gLocaleCache[eKOREAN] = Locale("ko");
152 gLocaleCache[eCHINESE] = Locale("zh");
153 gLocaleCache[eFRANCE] = Locale("fr", "FR");
154 gLocaleCache[eGERMANY] = Locale("de", "DE");
155 gLocaleCache[eITALY] = Locale("it", "IT");
156 gLocaleCache[eJAPAN] = Locale("ja", "JP");
157 gLocaleCache[eKOREA] = Locale("ko", "KR");
158 gLocaleCache[eCHINA] = Locale("zh", "CN");
159 gLocaleCache[eTAIWAN] = Locale("zh", "TW");
160 gLocaleCache[eUK] = Locale("en", "GB");
161 gLocaleCache[eUS] = Locale("en", "US");
162 gLocaleCache[eCANADA] = Locale("en", "CA");
163 gLocaleCache[eCANADA_FRENCH] = Locale("fr", "CA");
164 }
165
166 U_CDECL_END
167
168 U_NAMESPACE_BEGIN
169
locale_set_default_internal(const char * id,UErrorCode & status)170 Locale *locale_set_default_internal(const char *id, UErrorCode& status) {
171 // Synchronize this entire function.
172 Mutex lock(&gDefaultLocaleMutex);
173
174 UBool canonicalize = FALSE;
175
176 // If given a NULL string for the locale id, grab the default
177 // name from the system.
178 // (Different from most other locale APIs, where a null name means use
179 // the current ICU default locale.)
180 if (id == NULL) {
181 id = uprv_getDefaultLocaleID(); // This function not thread safe? TODO: verify.
182 canonicalize = TRUE; // always canonicalize host ID
183 }
184
185 CharString localeNameBuf;
186 {
187 CharStringByteSink sink(&localeNameBuf);
188 if (canonicalize) {
189 ulocimp_canonicalize(id, sink, &status);
190 } else {
191 ulocimp_getName(id, sink, &status);
192 }
193 }
194
195 if (U_FAILURE(status)) {
196 return gDefaultLocale;
197 }
198
199 if (gDefaultLocalesHashT == NULL) {
200 gDefaultLocalesHashT = uhash_open(uhash_hashChars, uhash_compareChars, NULL, &status);
201 if (U_FAILURE(status)) {
202 return gDefaultLocale;
203 }
204 uhash_setValueDeleter(gDefaultLocalesHashT, deleteLocale);
205 ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup);
206 }
207
208 Locale *newDefault = (Locale *)uhash_get(gDefaultLocalesHashT, localeNameBuf.data());
209 if (newDefault == NULL) {
210 newDefault = new Locale(Locale::eBOGUS);
211 if (newDefault == NULL) {
212 status = U_MEMORY_ALLOCATION_ERROR;
213 return gDefaultLocale;
214 }
215 newDefault->init(localeNameBuf.data(), FALSE);
216 uhash_put(gDefaultLocalesHashT, (char*) newDefault->getName(), newDefault, &status);
217 if (U_FAILURE(status)) {
218 return gDefaultLocale;
219 }
220 }
221 gDefaultLocale = newDefault;
222 return gDefaultLocale;
223 }
224
225 U_NAMESPACE_END
226
227 /* sfb 07/21/99 */
228 U_CFUNC void
locale_set_default(const char * id)229 locale_set_default(const char *id)
230 {
231 U_NAMESPACE_USE
232 UErrorCode status = U_ZERO_ERROR;
233 locale_set_default_internal(id, status);
234 }
235 /* end */
236
237 U_CFUNC const char *
locale_get_default(void)238 locale_get_default(void)
239 {
240 U_NAMESPACE_USE
241 return Locale::getDefault().getName();
242 }
243
244
245 U_NAMESPACE_BEGIN
246
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)247 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)
248
249 /*Character separating the posix id fields*/
250 // '_'
251 // In the platform codepage.
252 #define SEP_CHAR '_'
253 #define NULL_CHAR '\0'
254
255 Locale::~Locale()
256 {
257 if (baseName != fullName) {
258 uprv_free(baseName);
259 }
260 baseName = NULL;
261 /*if fullName is on the heap, we free it*/
262 if (fullName != fullNameBuffer)
263 {
264 uprv_free(fullName);
265 fullName = NULL;
266 }
267 }
268
Locale()269 Locale::Locale()
270 : UObject(), fullName(fullNameBuffer), baseName(NULL)
271 {
272 init(NULL, FALSE);
273 }
274
275 /*
276 * Internal constructor to allow construction of a locale object with
277 * NO side effects. (Default constructor tries to get
278 * the default locale.)
279 */
Locale(Locale::ELocaleType)280 Locale::Locale(Locale::ELocaleType)
281 : UObject(), fullName(fullNameBuffer), baseName(NULL)
282 {
283 setToBogus();
284 }
285
286
Locale(const char * newLanguage,const char * newCountry,const char * newVariant,const char * newKeywords)287 Locale::Locale( const char * newLanguage,
288 const char * newCountry,
289 const char * newVariant,
290 const char * newKeywords)
291 : UObject(), fullName(fullNameBuffer), baseName(NULL)
292 {
293 if( (newLanguage==NULL) && (newCountry == NULL) && (newVariant == NULL) )
294 {
295 init(NULL, FALSE); /* shortcut */
296 }
297 else
298 {
299 UErrorCode status = U_ZERO_ERROR;
300 int32_t size = 0;
301 int32_t lsize = 0;
302 int32_t csize = 0;
303 int32_t vsize = 0;
304 int32_t ksize = 0;
305
306 // Calculate the size of the resulting string.
307
308 // Language
309 if ( newLanguage != NULL )
310 {
311 lsize = (int32_t)uprv_strlen(newLanguage);
312 if ( lsize < 0 || lsize > ULOC_STRING_LIMIT ) { // int32 wrap
313 setToBogus();
314 return;
315 }
316 size = lsize;
317 }
318
319 CharString togo(newLanguage, lsize, status); // start with newLanguage
320
321 // _Country
322 if ( newCountry != NULL )
323 {
324 csize = (int32_t)uprv_strlen(newCountry);
325 if ( csize < 0 || csize > ULOC_STRING_LIMIT ) { // int32 wrap
326 setToBogus();
327 return;
328 }
329 size += csize;
330 }
331
332 // _Variant
333 if ( newVariant != NULL )
334 {
335 // remove leading _'s
336 while(newVariant[0] == SEP_CHAR)
337 {
338 newVariant++;
339 }
340
341 // remove trailing _'s
342 vsize = (int32_t)uprv_strlen(newVariant);
343 if ( vsize < 0 || vsize > ULOC_STRING_LIMIT ) { // int32 wrap
344 setToBogus();
345 return;
346 }
347 while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) )
348 {
349 vsize--;
350 }
351 }
352
353 if( vsize > 0 )
354 {
355 size += vsize;
356 }
357
358 // Separator rules:
359 if ( vsize > 0 )
360 {
361 size += 2; // at least: __v
362 }
363 else if ( csize > 0 )
364 {
365 size += 1; // at least: _v
366 }
367
368 if ( newKeywords != NULL)
369 {
370 ksize = (int32_t)uprv_strlen(newKeywords);
371 if ( ksize < 0 || ksize > ULOC_STRING_LIMIT ) {
372 setToBogus();
373 return;
374 }
375 size += ksize + 1;
376 }
377
378 // NOW we have the full locale string..
379 // Now, copy it back.
380
381 // newLanguage is already copied
382
383 if ( ( vsize != 0 ) || (csize != 0) ) // at least: __v
384 { // ^
385 togo.append(SEP_CHAR, status);
386 }
387
388 if ( csize != 0 )
389 {
390 togo.append(newCountry, status);
391 }
392
393 if ( vsize != 0)
394 {
395 togo.append(SEP_CHAR, status)
396 .append(newVariant, vsize, status);
397 }
398
399 if ( ksize != 0)
400 {
401 if (uprv_strchr(newKeywords, '=')) {
402 togo.append('@', status); /* keyword parsing */
403 }
404 else {
405 togo.append('_', status); /* Variant parsing with a script */
406 if ( vsize == 0) {
407 togo.append('_', status); /* No country found */
408 }
409 }
410 togo.append(newKeywords, status);
411 }
412
413 if (U_FAILURE(status)) {
414 // Something went wrong with appending, etc.
415 setToBogus();
416 return;
417 }
418 // Parse it, because for example 'language' might really be a complete
419 // string.
420 init(togo.data(), FALSE);
421 }
422 }
423
// Copy constructor: starts from the empty state (embedded buffer, no base
// name) and delegates all copying to the copy-assignment operator.
Locale::Locale(const Locale &other)
    : UObject(other), fullName(fullNameBuffer), baseName(nullptr)
{
    operator=(other);
}
429
Locale(Locale && other)430 Locale::Locale(Locale&& other) U_NOEXCEPT
431 : UObject(other), fullName(fullNameBuffer), baseName(fullName) {
432 *this = std::move(other);
433 }
434
// Copy assignment. Self-assignment is a no-op. Otherwise this object is
// first reset with setToBogus() and then receives copies of other's names
// and fields. If duplicating a heap string fails, the function returns
// early without copying the remaining fields.
Locale& Locale::operator=(const Locale& other) {
    if (this == &other) {
        return *this;
    }

    setToBogus();

    // fullName: null stays null, the embedded buffer is copied in place,
    // and a heap string is duplicated.
    if (other.fullName == nullptr) {
        fullName = nullptr;
    } else if (other.fullName == other.fullNameBuffer) {
        uprv_strcpy(fullNameBuffer, other.fullNameBuffer);
    } else {
        fullName = uprv_strdup(other.fullName);
        if (fullName == nullptr) {
            return *this;
        }
    }

    // baseName: keep the aliasing relationship when other's base name is
    // just its full name; otherwise duplicate it if there is one.
    if (other.baseName == other.fullName) {
        baseName = fullName;
    } else if (other.baseName != nullptr) {
        baseName = uprv_strdup(other.baseName);
        if (baseName == nullptr) {
            return *this;
        }
    }

    uprv_strcpy(language, other.language);
    uprv_strcpy(script, other.script);
    uprv_strcpy(country, other.country);

    variantBegin = other.variantBegin;
    fIsBogus = other.fIsBogus;

    return *this;
}
467
operator =(Locale && other)468 Locale& Locale::operator=(Locale&& other) U_NOEXCEPT {
469 if (baseName != fullName) uprv_free(baseName);
470 if (fullName != fullNameBuffer) uprv_free(fullName);
471
472 if (other.fullName == other.fullNameBuffer) {
473 uprv_strcpy(fullNameBuffer, other.fullNameBuffer);
474 fullName = fullNameBuffer;
475 } else {
476 fullName = other.fullName;
477 }
478
479 if (other.baseName == other.fullName) {
480 baseName = fullName;
481 } else {
482 baseName = other.baseName;
483 }
484
485 uprv_strcpy(language, other.language);
486 uprv_strcpy(script, other.script);
487 uprv_strcpy(country, other.country);
488
489 variantBegin = other.variantBegin;
490 fIsBogus = other.fIsBogus;
491
492 other.baseName = other.fullName = other.fullNameBuffer;
493
494 return *this;
495 }
496
497 Locale *
clone() const498 Locale::clone() const {
499 return new Locale(*this);
500 }
501
502 UBool
operator ==(const Locale & other) const503 Locale::operator==( const Locale& other) const
504 {
505 return (uprv_strcmp(other.fullName, fullName) == 0);
506 }
507
508 namespace {
509
// Init-once state for gKnownCanonicalized; reset by cleanupKnownCanonicalized().
UInitOnce gKnownCanonicalizedInitOnce = U_INITONCE_INITIALIZER;
// Hash table whose keys are the strings of KNOWN_CANONICALIZED (each stored
// with the value 1); built by loadKnownCanonicalized().
UHashtable *gKnownCanonicalized = nullptr;

// Locale IDs that are known to be in canonical form already.
static const char* const KNOWN_CANONICALIZED[] = {
    "c",
    // Commonly used locales that are known to be already canonicalized.
    "af", "af_ZA", "am", "am_ET", "ar", "ar_001", "as", "as_IN", "az", "az_AZ",
    "be", "be_BY", "bg", "bg_BG", "bn", "bn_IN", "bs", "bs_BA", "ca", "ca_ES",
    "cs", "cs_CZ", "cy", "cy_GB", "da", "da_DK", "de", "de_DE", "el", "el_GR",
    "en", "en_GB", "en_US", "es", "es_419", "es_ES", "et", "et_EE", "eu",
    "eu_ES", "fa", "fa_IR", "fi", "fi_FI", "fil", "fil_PH", "fr", "fr_FR",
    "ga", "ga_IE", "gl", "gl_ES", "gu", "gu_IN", "he", "he_IL", "hi", "hi_IN",
    "hr", "hr_HR", "hu", "hu_HU", "hy", "hy_AM", "id", "id_ID", "is", "is_IS",
    "it", "it_IT", "ja", "ja_JP", "jv", "jv_ID", "ka", "ka_GE", "kk", "kk_KZ",
    "km", "km_KH", "kn", "kn_IN", "ko", "ko_KR", "ky", "ky_KG", "lo", "lo_LA",
    "lt", "lt_LT", "lv", "lv_LV", "mk", "mk_MK", "ml", "ml_IN", "mn", "mn_MN",
    "mr", "mr_IN", "ms", "ms_MY", "my", "my_MM", "nb", "nb_NO", "ne", "ne_NP",
    "nl", "nl_NL", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF",
    "pt", "pt_BR", "pt_PT", "ro", "ro_RO", "ru", "ru_RU", "sd", "sd_IN", "si",
    "si_LK", "sk", "sk_SK", "sl", "sl_SI", "so", "so_SO", "sq", "sq_AL", "sr",
    "sr_Cyrl_RS", "sr_Latn", "sr_RS", "sv", "sv_SE", "sw", "sw_TZ", "ta",
    "ta_IN", "te", "te_IN", "th", "th_TH", "tk", "tk_TM", "tr", "tr_TR", "uk",
    "uk_UA", "ur", "ur_PK", "uz", "uz_UZ", "vi", "vi_VN", "yue", "yue_Hant",
    "yue_Hant_HK", "yue_HK", "zh", "zh_CN", "zh_Hans", "zh_Hans_CN", "zh_Hant",
    "zh_Hant_TW", "zh_TW", "zu", "zu_ZA"
};
536
cleanupKnownCanonicalized()537 static UBool U_CALLCONV cleanupKnownCanonicalized() {
538 gKnownCanonicalizedInitOnce.reset();
539 if (gKnownCanonicalized) { uhash_close(gKnownCanonicalized); }
540 return TRUE;
541 }
542
loadKnownCanonicalized(UErrorCode & status)543 static void U_CALLCONV loadKnownCanonicalized(UErrorCode &status) {
544 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_KNOWN_CANONICALIZED,
545 cleanupKnownCanonicalized);
546 LocalUHashtablePointer newKnownCanonicalizedMap(
547 uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status));
548 for (int32_t i = 0;
549 U_SUCCESS(status) && i < UPRV_LENGTHOF(KNOWN_CANONICALIZED);
550 i++) {
551 uhash_puti(newKnownCanonicalizedMap.getAlias(),
552 (void*)KNOWN_CANONICALIZED[i],
553 1, &status);
554 }
555 if (U_FAILURE(status)) {
556 return;
557 }
558
559 gKnownCanonicalized = newKnownCanonicalizedMap.orphan();
560 }
561
562 class AliasData;
563
/**
 * A Builder class to build the alias data.
 */
class AliasDataBuilder {
public:
    AliasDataBuilder() {
    }

    // Build the AliasData from resource.
    AliasData* build(UErrorCode &status);

private:
    // Shared worker for the four read*Alias functions below.
    // Reads every entry of alias into strings+types+replacementIndexes and
    // stores the number of records into length.
    // checkType is called with the key of each entry and checkReplacement
    // with its "replacement" string.
    void readAlias(UResourceBundle* alias,
                   UniqueCharStrings* strings,
                   LocalMemory<const char*>& types,
                   LocalMemory<int32_t>& replacementIndexes,
                   int32_t &length,
                   void (*checkType)(const char* type),
                   void (*checkReplacement)(const UnicodeString& replacement),
                   UErrorCode &status);

    // Read the languageAlias data from alias to
    // strings+types+replacementIndexes
    // The number of record will be stored into length.
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement language.
    void readLanguageAlias(UResourceBundle* alias,
                           UniqueCharStrings* strings,
                           LocalMemory<const char*>& types,
                           LocalMemory<int32_t>& replacementIndexes,
                           int32_t &length,
                           UErrorCode &status);

    // Read the scriptAlias data from alias to
    // strings+types+replacementIndexes
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement script.
    void readScriptAlias(UResourceBundle* alias,
                         UniqueCharStrings* strings,
                         LocalMemory<const char*>& types,
                         LocalMemory<int32_t>& replacementIndexes,
                         int32_t &length, UErrorCode &status);

    // Read the territoryAlias data from alias to
    // strings+types+replacementIndexes
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement territory.
    void readTerritoryAlias(UResourceBundle* alias,
                            UniqueCharStrings* strings,
                            LocalMemory<const char*>& types,
                            LocalMemory<int32_t>& replacementIndexes,
                            int32_t &length, UErrorCode &status);

    // Read the variantAlias data from alias to
    // strings+types+replacementIndexes
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement variant.
    void readVariantAlias(UResourceBundle* alias,
                          UniqueCharStrings* strings,
                          LocalMemory<const char*>& types,
                          LocalMemory<int32_t>& replacementIndexes,
                          int32_t &length, UErrorCode &status);
};
631
/**
 * A class to hold the Alias Data.
 *
 * Holds one map per subtag kind (language, script, territory, variant) plus
 * the string storage handed over by AliasDataBuilder. The only instance is
 * the one reachable through singleton().
 */
class AliasData : public UMemory {
public:
    // Returns the shared instance, loading it on first use via loadData().
    // Returns nullptr without attempting to load when status already holds
    // an error on entry; after a load attempt the result is whatever
    // loadData() stored in gSingleton.
    static const AliasData* singleton(UErrorCode& status) {
        if (U_FAILURE(status)) {
            // Do not get into loadData if the status already has error.
            return nullptr;
        }
        umtx_initOnce(AliasData::gInitOnce, &AliasData::loadData, status);
        return gSingleton;
    }

    // Read-only access to the four alias maps.
    const CharStringMap& languageMap() const { return language; }
    const CharStringMap& scriptMap() const { return script; }
    const CharStringMap& territoryMap() const { return territory; }
    const CharStringMap& variantMap() const { return variant; }

    // Init-once callback that builds gSingleton.
    static void U_CALLCONV loadData(UErrorCode &status);
    // Cleanup callback registered by loadData().
    static UBool U_CALLCONV cleanup();

    // Init-once state guarding loadData().
    static UInitOnce gInitOnce;

private:
    // Takes the four maps by value and moves them into the members; also
    // takes ownership of strings (deleted in the destructor).
    AliasData(CharStringMap languageMap,
              CharStringMap scriptMap,
              CharStringMap territoryMap,
              CharStringMap variantMap,
              CharString* strings)
        : language(std::move(languageMap)),
          script(std::move(scriptMap)),
          territory(std::move(territoryMap)),
          variant(std::move(variantMap)),
          strings(strings) {
    }

    ~AliasData() {
        delete strings;
    }

    // The shared instance; set by loadData().
    static const AliasData* gSingleton;

    CharStringMap language;
    CharStringMap script;
    CharStringMap territory;
    CharStringMap variant;
    // Owned storage received from the builder (see build()).
    CharString* strings;

    friend class AliasDataBuilder;
};
683
684
685 const AliasData* AliasData::gSingleton = nullptr;
686 UInitOnce AliasData::gInitOnce = U_INITONCE_INITIALIZER;
687
688 UBool U_CALLCONV
cleanup()689 AliasData::cleanup()
690 {
691 gInitOnce.reset();
692 delete gSingleton;
693 return TRUE;
694 }
695
696 void
readAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,void (* checkType)(const char * type),void (* checkReplacement)(const UnicodeString & replacement),UErrorCode & status)697 AliasDataBuilder::readAlias(
698 UResourceBundle* alias,
699 UniqueCharStrings* strings,
700 LocalMemory<const char*>& types,
701 LocalMemory<int32_t>& replacementIndexes,
702 int32_t &length,
703 void (*checkType)(const char* type),
704 void (*checkReplacement)(const UnicodeString& replacement),
705 UErrorCode &status) {
706 if (U_FAILURE(status)) {
707 return;
708 }
709 length = ures_getSize(alias);
710 const char** rawTypes = types.allocateInsteadAndCopy(length);
711 if (rawTypes == nullptr) {
712 status = U_MEMORY_ALLOCATION_ERROR;
713 return;
714 }
715 int32_t* rawIndexes = replacementIndexes.allocateInsteadAndCopy(length);
716 if (rawIndexes == nullptr) {
717 status = U_MEMORY_ALLOCATION_ERROR;
718 return;
719 }
720 int i = 0;
721 while (ures_hasNext(alias)) {
722 LocalUResourceBundlePointer res(
723 ures_getNextResource(alias, nullptr, &status));
724 const char* aliasFrom = ures_getKey(res.getAlias());
725 UnicodeString aliasTo =
726 ures_getUnicodeStringByKey(res.getAlias(), "replacement", &status);
727
728 checkType(aliasFrom);
729 checkReplacement(aliasTo);
730
731 rawTypes[i] = aliasFrom;
732 rawIndexes[i] = strings->add(aliasTo, status);
733 i++;
734 }
735 }
736
737 /**
738 * Read the languageAlias data from alias to strings+types+replacementIndexes.
739 * Allocate length items for types, to store the type field. Allocate length
740 * items for replacementIndexes, to store the index in the strings for the
741 * replacement language.
742 */
743 void
readLanguageAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)744 AliasDataBuilder::readLanguageAlias(
745 UResourceBundle* alias,
746 UniqueCharStrings* strings,
747 LocalMemory<const char*>& types,
748 LocalMemory<int32_t>& replacementIndexes,
749 int32_t &length,
750 UErrorCode &status)
751 {
752 return readAlias(
753 alias, strings, types, replacementIndexes, length,
754 #if U_DEBUG
755 [](const char* type) {
756 // Assert the aliasFrom only contains the following possibilties
757 // language_REGION_variant
758 // language_REGION
759 // language_variant
760 // language
761 // und_variant
762 Locale test(type);
763 // Assert no script in aliasFrom
764 U_ASSERT(test.getScript()[0] == '\0');
765 // Assert when language is und, no REGION in aliasFrom.
766 U_ASSERT(test.getLanguage()[0] != '\0' || test.getCountry()[0] == '\0');
767 },
768 #else
769 [](const char*) {},
770 #endif
771 [](const UnicodeString&) {}, status);
772 }
773
774 /**
775 * Read the scriptAlias data from alias to strings+types+replacementIndexes.
776 * Allocate length items for types, to store the type field. Allocate length
777 * items for replacementIndexes, to store the index in the strings for the
778 * replacement script.
779 */
780 void
readScriptAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)781 AliasDataBuilder::readScriptAlias(
782 UResourceBundle* alias,
783 UniqueCharStrings* strings,
784 LocalMemory<const char*>& types,
785 LocalMemory<int32_t>& replacementIndexes,
786 int32_t &length,
787 UErrorCode &status)
788 {
789 return readAlias(
790 alias, strings, types, replacementIndexes, length,
791 #if U_DEBUG
792 [](const char* type) {
793 U_ASSERT(uprv_strlen(type) == 4);
794 },
795 [](const UnicodeString& replacement) {
796 U_ASSERT(replacement.length() == 4);
797 },
798 #else
799 [](const char*) {},
800 [](const UnicodeString&) { },
801 #endif
802 status);
803 }
804
805 /**
806 * Read the territoryAlias data from alias to strings+types+replacementIndexes.
807 * Allocate length items for types, to store the type field. Allocate length
808 * items for replacementIndexes, to store the index in the strings for the
809 * replacement regions.
810 */
811 void
readTerritoryAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)812 AliasDataBuilder::readTerritoryAlias(
813 UResourceBundle* alias,
814 UniqueCharStrings* strings,
815 LocalMemory<const char*>& types,
816 LocalMemory<int32_t>& replacementIndexes,
817 int32_t &length,
818 UErrorCode &status)
819 {
820 return readAlias(
821 alias, strings, types, replacementIndexes, length,
822 #if U_DEBUG
823 [](const char* type) {
824 U_ASSERT(uprv_strlen(type) == 2 || uprv_strlen(type) == 3);
825 },
826 #else
827 [](const char*) {},
828 #endif
829 [](const UnicodeString&) { },
830 status);
831 }
832
833 /**
834 * Read the variantAlias data from alias to strings+types+replacementIndexes.
835 * Allocate length items for types, to store the type field. Allocate length
836 * items for replacementIndexes, to store the index in the strings for the
837 * replacement variant.
838 */
839 void
readVariantAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)840 AliasDataBuilder::readVariantAlias(
841 UResourceBundle* alias,
842 UniqueCharStrings* strings,
843 LocalMemory<const char*>& types,
844 LocalMemory<int32_t>& replacementIndexes,
845 int32_t &length,
846 UErrorCode &status)
847 {
848 return readAlias(
849 alias, strings, types, replacementIndexes, length,
850 #if U_DEBUG
851 [](const char* type) {
852 U_ASSERT(uprv_strlen(type) >= 4 && uprv_strlen(type) <= 8);
853 U_ASSERT(uprv_strlen(type) != 4 ||
854 (type[0] >= '0' && type[0] <= '9'));
855 },
856 [](const UnicodeString& replacement) {
857 U_ASSERT(replacement.length() >= 4 && replacement.length() <= 8);
858 U_ASSERT(replacement.length() != 4 ||
859 (replacement.charAt(0) >= u'0' &&
860 replacement.charAt(0) <= u'9'));
861 },
862 #else
863 [](const char*) {},
864 [](const UnicodeString&) { },
865 #endif
866 status);
867 }
868
869 /**
870 * Initializes the alias data from the ICU resource bundles. The alias data
871 * contains alias of language, country, script and variants.
872 *
873 * If the alias data has already loaded, then this method simply returns without
874 * doing anything meaningful.
875 */
876 void U_CALLCONV
loadData(UErrorCode & status)877 AliasData::loadData(UErrorCode &status)
878 {
879 #ifdef LOCALE_CANONICALIZATION_DEBUG
880 UDate start = uprv_getRawUTCtime();
881 #endif // LOCALE_CANONICALIZATION_DEBUG
882 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_ALIAS, cleanup);
883 AliasDataBuilder builder;
884 gSingleton = builder.build(status);
885 #ifdef LOCALE_CANONICALIZATION_DEBUG
886 UDate end = uprv_getRawUTCtime();
887 printf("AliasData::loadData took total %f ms\n", end - start);
888 #endif // LOCALE_CANONICALIZATION_DEBUG
889 }
890
/**
 * Build the alias data from resources.
 *
 * Opens the "metadata" bundle, reads the "language", "script", "territory"
 * and "variant" tables found under its "alias" table, and turns each of them
 * into a CharStringMap from alias key to replacement string.
 *
 * @param status Receives any error.
 * @return A newly allocated AliasData, or nullptr on failure (status is then
 *         set, including U_MEMORY_ALLOCATION_ERROR when the final allocation
 *         fails).
 */
AliasData*
AliasDataBuilder::build(UErrorCode &status) {
    // Locate the four alias tables. The calls share one status, which is
    // checked once after all of them.
    LocalUResourceBundlePointer metadata(
        ures_openDirect(nullptr, "metadata", &status));
    LocalUResourceBundlePointer metadataAlias(
        ures_getByKey(metadata.getAlias(), "alias", nullptr, &status));
    LocalUResourceBundlePointer languageAlias(
        ures_getByKey(metadataAlias.getAlias(), "language", nullptr, &status));
    LocalUResourceBundlePointer scriptAlias(
        ures_getByKey(metadataAlias.getAlias(), "script", nullptr, &status));
    LocalUResourceBundlePointer territoryAlias(
        ures_getByKey(metadataAlias.getAlias(), "territory", nullptr, &status));
    LocalUResourceBundlePointer variantAlias(
        ures_getByKey(metadataAlias.getAlias(), "variant", nullptr, &status));

    if (U_FAILURE(status)) {
        return nullptr;
    }
    int32_t languagesLength = 0, scriptLength = 0, territoryLength = 0,
            variantLength = 0;

    // Read the languageAlias into languageTypes, languageReplacementIndexes
    // and strings
    UniqueCharStrings strings(status);
    LocalMemory<const char*> languageTypes;
    LocalMemory<int32_t> languageReplacementIndexes;
    readLanguageAlias(languageAlias.getAlias(),
                      &strings,
                      languageTypes,
                      languageReplacementIndexes,
                      languagesLength,
                      status);

    // Read the scriptAlias into scriptTypes, scriptReplacementIndexes
    // and strings
    LocalMemory<const char*> scriptTypes;
    LocalMemory<int32_t> scriptReplacementIndexes;
    readScriptAlias(scriptAlias.getAlias(),
                    &strings,
                    scriptTypes,
                    scriptReplacementIndexes,
                    scriptLength,
                    status);

    // Read the territoryAlias into territoryTypes, territoryReplacementIndexes
    // and strings
    LocalMemory<const char*> territoryTypes;
    LocalMemory<int32_t> territoryReplacementIndexes;
    readTerritoryAlias(territoryAlias.getAlias(),
                       &strings,
                       territoryTypes,
                       territoryReplacementIndexes,
                       territoryLength, status);

    // Read the variantAlias into variantTypes, variantReplacementIndexes
    // and strings
    LocalMemory<const char*> variantTypes;
    LocalMemory<int32_t> variantReplacementIndexes;
    readVariantAlias(variantAlias.getAlias(),
                     &strings,
                     variantTypes,
                     variantReplacementIndexes,
                     variantLength, status);

    // One check covers all four reads above.
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // We can only use strings after freeze it.
    strings.freeze();

    // Build the languageMap from languageTypes & languageReplacementIndexes
    // NOTE(review): the numeric first argument of each CharStringMap below is
    // presumably an initial size hint - confirm against charstrmap.h.
    CharStringMap languageMap(490, status);
    for (int32_t i = 0; U_SUCCESS(status) && i < languagesLength; i++) {
        languageMap.put(languageTypes[i],
                        strings.get(languageReplacementIndexes[i]),
                        status);
    }

    // Build the scriptMap from scriptTypes & scriptReplacementIndexes
    CharStringMap scriptMap(1, status);
    for (int32_t i = 0; U_SUCCESS(status) && i < scriptLength; i++) {
        scriptMap.put(scriptTypes[i],
                      strings.get(scriptReplacementIndexes[i]),
                      status);
    }

    // Build the territoryMap from territoryTypes & territoryReplacementIndexes
    CharStringMap territoryMap(650, status);
    for (int32_t i = 0; U_SUCCESS(status) && i < territoryLength; i++) {
        territoryMap.put(territoryTypes[i],
                         strings.get(territoryReplacementIndexes[i]),
                         status);
    }

    // Build the variantMap from variantTypes & variantReplacementIndexes.
    CharStringMap variantMap(2, status);
    for (int32_t i = 0; U_SUCCESS(status) && i < variantLength; i++) {
        variantMap.put(variantTypes[i],
                       strings.get(variantReplacementIndexes[i]),
                       status);
    }

    if (U_FAILURE(status)) {
        return nullptr;
    }

    // Move the four maps into the new AliasData and hand it the string
    // storage released by strings.orphanCharStrings().
    auto *data = new AliasData(
        std::move(languageMap),
        std::move(scriptMap),
        std::move(territoryMap),
        std::move(variantMap),
        strings.orphanCharStrings());

    if (data == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return data;
}
1014
1015 /**
1016 * A class that find the replacement values of locale fields by using AliasData.
1017 */
1018 class AliasReplacer {
1019 public:
AliasReplacer(UErrorCode status)1020 AliasReplacer(UErrorCode status) :
1021 language(nullptr), script(nullptr), region(nullptr),
1022 extensions(nullptr), variants(status),
1023 data(nullptr) {
1024 }
~AliasReplacer()1025 ~AliasReplacer() {
1026 }
1027
1028 // Check the fields inside locale, if need to replace fields,
1029 // place the the replaced locale ID in out and return true.
1030 // Otherwise return false for no replacement or error.
1031 bool replace(
1032 const Locale& locale, CharString& out, UErrorCode& status);
1033
1034 private:
1035 const char* language;
1036 const char* script;
1037 const char* region;
1038 const char* extensions;
1039 UVector variants;
1040
1041 const AliasData* data;
1042
notEmpty(const char * str)1043 inline bool notEmpty(const char* str) {
1044 return str && str[0] != NULL_CHAR;
1045 }
1046
1047 /**
1048 * If replacement is neither null nor empty and input is either null or empty,
1049 * return replacement.
1050 * If replacement is neither null nor empty but input is not empty, return input.
1051 * If replacement is either null or empty and type is either null or empty,
1052 * return input.
1053 * Otherwise return null.
1054 * replacement input type return
1055 * AAA nullptr * AAA
1056 * AAA BBB * BBB
1057 * nullptr || "" CCC nullptr CCC
1058 * nullptr || "" * DDD nullptr
1059 */
deleteOrReplace(const char * input,const char * type,const char * replacement)1060 inline const char* deleteOrReplace(
1061 const char* input, const char* type, const char* replacement) {
1062 return notEmpty(replacement) ?
1063 ((input == nullptr) ? replacement : input) :
1064 ((type == nullptr) ? input : nullptr);
1065 }
1066
same(const char * a,const char * b)1067 inline bool same(const char* a, const char* b) {
1068 if (a == nullptr && b == nullptr) {
1069 return true;
1070 }
1071 if ((a == nullptr && b != nullptr) ||
1072 (a != nullptr && b == nullptr)) {
1073 return false;
1074 }
1075 return uprv_strcmp(a, b) == 0;
1076 }
1077
1078 // Gather fields and generate locale ID into out.
1079 CharString& outputToString(CharString& out, UErrorCode status);
1080
1081 // Generate the lookup key.
1082 CharString& generateKey(const char* language, const char* region,
1083 const char* variant, CharString& out,
1084 UErrorCode status);
1085
1086 void parseLanguageReplacement(const char* replacement,
1087 const char*& replaceLanguage,
1088 const char*& replaceScript,
1089 const char*& replaceRegion,
1090 const char*& replaceVariant,
1091 const char*& replaceExtensions,
1092 UVector& toBeFreed,
1093 UErrorCode& status);
1094
1095 // Replace by using languageAlias.
1096 bool replaceLanguage(bool checkLanguage, bool checkRegion,
1097 bool checkVariants, UVector& toBeFreed,
1098 UErrorCode& status);
1099
1100 // Replace by using territoryAlias.
1101 bool replaceTerritory(UVector& toBeFreed, UErrorCode& status);
1102
1103 // Replace by using scriptAlias.
1104 bool replaceScript(UErrorCode& status);
1105
1106 // Replace by using variantAlias.
1107 bool replaceVariant(UErrorCode& status);
1108 };
1109
1110 CharString&
generateKey(const char * language,const char * region,const char * variant,CharString & out,UErrorCode status)1111 AliasReplacer::generateKey(
1112 const char* language, const char* region, const char* variant,
1113 CharString& out, UErrorCode status)
1114 {
1115 out.append(language, status);
1116 if (notEmpty(region)) {
1117 out.append(SEP_CHAR, status)
1118 .append(region, status);
1119 }
1120 if (notEmpty(variant)) {
1121 out.append(SEP_CHAR, status)
1122 .append(variant, status);
1123 }
1124 return out;
1125 }
1126
1127 void
parseLanguageReplacement(const char * replacement,const char * & replacedLanguage,const char * & replacedScript,const char * & replacedRegion,const char * & replacedVariant,const char * & replacedExtensions,UVector & toBeFreed,UErrorCode & status)1128 AliasReplacer::parseLanguageReplacement(
1129 const char* replacement,
1130 const char*& replacedLanguage,
1131 const char*& replacedScript,
1132 const char*& replacedRegion,
1133 const char*& replacedVariant,
1134 const char*& replacedExtensions,
1135 UVector& toBeFreed,
1136 UErrorCode& status)
1137 {
1138 if (U_FAILURE(status)) {
1139 return;
1140 }
1141 replacedScript = replacedRegion = replacedVariant
1142 = replacedExtensions = nullptr;
1143 if (uprv_strchr(replacement, '_') == nullptr) {
1144 replacedLanguage = replacement;
1145 // reach the end, just return it.
1146 return;
1147 }
1148 // We have multiple field so we have to allocate and parse
1149 CharString* str = new CharString(
1150 replacement, (int32_t)uprv_strlen(replacement), status);
1151 if (U_FAILURE(status)) {
1152 return;
1153 }
1154 if (str == nullptr) {
1155 status = U_MEMORY_ALLOCATION_ERROR;
1156 return;
1157 }
1158 toBeFreed.addElement(str, status);
1159 char* data = str->data();
1160 replacedLanguage = (const char*) data;
1161 char* endOfField = uprv_strchr(data, '_');
1162 *endOfField = '\0'; // null terminiate it.
1163 endOfField++;
1164 const char* start = endOfField;
1165 endOfField = (char*) uprv_strchr(start, '_');
1166 size_t len = 0;
1167 if (endOfField == nullptr) {
1168 len = uprv_strlen(start);
1169 } else {
1170 len = endOfField - start;
1171 *endOfField = '\0'; // null terminiate it.
1172 }
1173 if (len == 4 && uprv_isASCIILetter(*start)) {
1174 // Got a script
1175 replacedScript = start;
1176 if (endOfField == nullptr) {
1177 return;
1178 }
1179 start = endOfField++;
1180 endOfField = (char*)uprv_strchr(start, '_');
1181 if (endOfField == nullptr) {
1182 len = uprv_strlen(start);
1183 } else {
1184 len = endOfField - start;
1185 *endOfField = '\0'; // null terminiate it.
1186 }
1187 }
1188 if (len >= 2 && len <= 3) {
1189 // Got a region
1190 replacedRegion = start;
1191 if (endOfField == nullptr) {
1192 return;
1193 }
1194 start = endOfField++;
1195 endOfField = (char*)uprv_strchr(start, '_');
1196 if (endOfField == nullptr) {
1197 len = uprv_strlen(start);
1198 } else {
1199 len = endOfField - start;
1200 *endOfField = '\0'; // null terminiate it.
1201 }
1202 }
1203 if (len >= 4) {
1204 // Got a variant
1205 replacedVariant = start;
1206 if (endOfField == nullptr) {
1207 return;
1208 }
1209 start = endOfField++;
1210 }
1211 replacedExtensions = start;
1212 }
1213
1214 bool
replaceLanguage(bool checkLanguage,bool checkRegion,bool checkVariants,UVector & toBeFreed,UErrorCode & status)1215 AliasReplacer::replaceLanguage(
1216 bool checkLanguage, bool checkRegion,
1217 bool checkVariants, UVector& toBeFreed, UErrorCode& status)
1218 {
1219 if (U_FAILURE(status)) {
1220 return false;
1221 }
1222 if ( (checkRegion && region == nullptr) ||
1223 (checkVariants && variants.size() == 0)) {
1224 // Nothing to search.
1225 return false;
1226 }
1227 int32_t variant_size = checkVariants ? variants.size() : 1;
1228 // Since we may have more than one variant, we need to loop through them.
1229 const char* searchLanguage = checkLanguage ? language : "und";
1230 const char* searchRegion = checkRegion ? region : nullptr;
1231 const char* searchVariant = nullptr;
1232 for (int32_t variant_index = 0;
1233 variant_index < variant_size;
1234 variant_index++) {
1235 if (checkVariants) {
1236 U_ASSERT(variant_index < variant_size);
1237 searchVariant = (const char*)(variants.elementAt(variant_index));
1238 }
1239
1240 if (searchVariant != nullptr && uprv_strlen(searchVariant) < 4) {
1241 // Do not consider ill-formed variant subtag.
1242 searchVariant = nullptr;
1243 }
1244 CharString typeKey;
1245 generateKey(searchLanguage, searchRegion, searchVariant, typeKey,
1246 status);
1247 if (U_FAILURE(status)) {
1248 return false;
1249 }
1250 const char *replacement = data->languageMap().get(typeKey.data());
1251 if (replacement == nullptr) {
1252 // Found no replacement data.
1253 continue;
1254 }
1255
1256 const char* replacedLanguage = nullptr;
1257 const char* replacedScript = nullptr;
1258 const char* replacedRegion = nullptr;
1259 const char* replacedVariant = nullptr;
1260 const char* replacedExtensions = nullptr;
1261 parseLanguageReplacement(replacement,
1262 replacedLanguage,
1263 replacedScript,
1264 replacedRegion,
1265 replacedVariant,
1266 replacedExtensions,
1267 toBeFreed,
1268 status);
1269 replacedLanguage =
1270 (replacedLanguage != nullptr && uprv_strcmp(replacedLanguage, "und") == 0) ?
1271 language : replacedLanguage;
1272 replacedScript = deleteOrReplace(script, nullptr, replacedScript);
1273 replacedRegion = deleteOrReplace(region, searchRegion, replacedRegion);
1274 replacedVariant = deleteOrReplace(
1275 searchVariant, searchVariant, replacedVariant);
1276
1277 if ( same(language, replacedLanguage) &&
1278 same(script, replacedScript) &&
1279 same(region, replacedRegion) &&
1280 same(searchVariant, replacedVariant) &&
1281 replacedExtensions == nullptr) {
1282 // Replacement produce no changes.
1283 continue;
1284 }
1285
1286 language = replacedLanguage;
1287 region = replacedRegion;
1288 script = replacedScript;
1289 if (searchVariant != nullptr) {
1290 if (notEmpty(replacedVariant)) {
1291 variants.setElementAt((void*)replacedVariant, variant_index);
1292 } else {
1293 variants.removeElementAt(variant_index);
1294 }
1295 }
1296 if (replacedExtensions != nullptr) {
1297 // TODO(ICU-21292)
1298 // DO NOTHING
1299 // UTS35 does not specifiy what should we do if we have extensions in the
1300 // replacement. Currently we know only the following 4 "BCP47 LegacyRules" have
1301 // extensions in them languageAlias:
1302 // i_default => en_x_i_default
1303 // i_enochian => und_x_i_enochian
1304 // i_mingo => see_x_i_mingo
1305 // zh_min => nan_x_zh_min
1306 // But all of them are already changed by code inside ultag_parse() before
1307 // hitting this code.
1308 }
1309
1310 // Something changed by language alias data.
1311 return true;
1312 }
1313 // Nothing changed by language alias data.
1314 return false;
1315 }
1316
1317 bool
replaceTerritory(UVector & toBeFreed,UErrorCode & status)1318 AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status)
1319 {
1320 if (U_FAILURE(status)) {
1321 return false;
1322 }
1323 if (region == nullptr) {
1324 // No region to search.
1325 return false;
1326 }
1327 const char *replacement = data->territoryMap().get(region);
1328 if (replacement == nullptr) {
1329 // Found no replacement data for this region.
1330 return false;
1331 }
1332 const char* replacedRegion = replacement;
1333 const char* firstSpace = uprv_strchr(replacement, ' ');
1334 if (firstSpace != nullptr) {
1335 // If there are are more than one region in the replacement.
1336 // We need to check which one match based on the language.
1337 // Cannot use nullptr for language because that will construct
1338 // the default locale, in that case, use "und" to get the correct
1339 // locale.
1340 Locale l = LocaleBuilder()
1341 .setLanguage(language == nullptr ? "und" : language)
1342 .setScript(script)
1343 .build(status);
1344 l.addLikelySubtags(status);
1345 const char* likelyRegion = l.getCountry();
1346 LocalPointer<CharString> item;
1347 if (likelyRegion != nullptr && uprv_strlen(likelyRegion) > 0) {
1348 size_t len = uprv_strlen(likelyRegion);
1349 const char* foundInReplacement = uprv_strstr(replacement,
1350 likelyRegion);
1351 if (foundInReplacement != nullptr) {
1352 // Assuming the case there are no three letter region code in
1353 // the replacement of territoryAlias
1354 U_ASSERT(foundInReplacement == replacement ||
1355 *(foundInReplacement-1) == ' ');
1356 U_ASSERT(foundInReplacement[len] == ' ' ||
1357 foundInReplacement[len] == '\0');
1358 item.adoptInsteadAndCheckErrorCode(
1359 new CharString(foundInReplacement, (int32_t)len, status), status);
1360 }
1361 }
1362 if (item.isNull() && U_SUCCESS(status)) {
1363 item.adoptInsteadAndCheckErrorCode(
1364 new CharString(replacement,
1365 (int32_t)(firstSpace - replacement), status), status);
1366 }
1367 if (U_FAILURE(status)) { return false; }
1368 if (item.isNull()) {
1369 status = U_MEMORY_ALLOCATION_ERROR;
1370 return false;
1371 }
1372 replacedRegion = item->data();
1373 toBeFreed.addElement(item.orphan(), status);
1374 }
1375 U_ASSERT(!same(region, replacedRegion));
1376 region = replacedRegion;
1377 // The region is changed by data in territory alias.
1378 return true;
1379 }
1380
1381 bool
replaceScript(UErrorCode & status)1382 AliasReplacer::replaceScript(UErrorCode& status)
1383 {
1384 if (U_FAILURE(status)) {
1385 return false;
1386 }
1387 if (script == nullptr) {
1388 // No script to search.
1389 return false;
1390 }
1391 const char *replacement = data->scriptMap().get(script);
1392 if (replacement == nullptr) {
1393 // Found no replacement data for this script.
1394 return false;
1395 }
1396 U_ASSERT(!same(script, replacement));
1397 script = replacement;
1398 // The script is changed by data in script alias.
1399 return true;
1400 }
1401
1402 bool
replaceVariant(UErrorCode & status)1403 AliasReplacer::replaceVariant(UErrorCode& status)
1404 {
1405 if (U_FAILURE(status)) {
1406 return false;
1407 }
1408 // Since we may have more than one variant, we need to loop through them.
1409 for (int32_t i = 0; i < variants.size(); i++) {
1410 const char *variant = (const char*)(variants.elementAt(i));
1411 const char *replacement = data->variantMap().get(variant);
1412 if (replacement == nullptr) {
1413 // Found no replacement data for this variant.
1414 continue;
1415 }
1416 U_ASSERT((uprv_strlen(replacement) >= 5 &&
1417 uprv_strlen(replacement) <= 8) ||
1418 (uprv_strlen(replacement) == 4 &&
1419 replacement[0] >= '0' &&
1420 replacement[0] <= '9'));
1421 if (!same(variant, replacement)) {
1422 variants.setElementAt((void*)replacement, i);
1423 // Special hack to handle hepburn-heploc => alalc97
1424 if (uprv_strcmp(variant, "heploc") == 0) {
1425 for (int32_t j = 0; j < variants.size(); j++) {
1426 if (uprv_strcmp((const char*)(variants.elementAt(j)),
1427 "hepburn") == 0) {
1428 variants.removeElementAt(j);
1429 }
1430 }
1431 }
1432 return true;
1433 }
1434 }
1435 return false;
1436 }
1437
1438 CharString&
outputToString(CharString & out,UErrorCode status)1439 AliasReplacer::outputToString(
1440 CharString& out, UErrorCode status)
1441 {
1442 out.append(language, status);
1443 if (notEmpty(script)) {
1444 out.append(SEP_CHAR, status)
1445 .append(script, status);
1446 }
1447 if (notEmpty(region)) {
1448 out.append(SEP_CHAR, status)
1449 .append(region, status);
1450 }
1451 if (variants.size() > 0) {
1452 if (!notEmpty(script) && !notEmpty(region)) {
1453 out.append(SEP_CHAR, status);
1454 }
1455 variants.sort([](UElement e1, UElement e2) -> int8_t {
1456 // uprv_strcmp return int and in some platform, such as arm64-v8a,
1457 // it may return positive values > 127 which cause the casted value
1458 // of int8_t negative.
1459 int res = uprv_strcmp(
1460 (const char*)e1.pointer, (const char*)e2.pointer);
1461 return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
1462 }, status);
1463 int32_t variantsStart = out.length();
1464 for (int32_t i = 0; i < variants.size(); i++) {
1465 out.append(SEP_CHAR, status)
1466 .append((const char*)(variants.elementAt(i)),
1467 status);
1468 }
1469 T_CString_toUpperCase(out.data() + variantsStart);
1470 }
1471 if (notEmpty(extensions)) {
1472 CharString tmp("und_", status);
1473 tmp.append(extensions, status);
1474 Locale tmpLocale(tmp.data());
1475 // only support x extension inside CLDR for now.
1476 U_ASSERT(extensions[0] == 'x');
1477 out.append(tmpLocale.getName() + 1, status);
1478 }
1479 return out;
1480 }
1481
/**
 * Applies the alias data to the fields of locale until no rule matches
 * any more.
 *
 * @param locale Locale whose language, script, region, variants and
 *               keywords are examined.
 * @param out    Cleared on entry; receives the rewritten locale ID when
 *               the function returns true.
 * @param status In/out error code.
 * @return true if the rewritten ID differs from locale.getName();
 *         false if nothing changed or on error.
 */
bool
AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status)
{
    data = AliasData::singleton(status);
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(data != nullptr);
    out.clear();
    // Load the fields; an empty field is represented as nullptr.
    language = locale.getLanguage();
    if (!notEmpty(language)) {
        language = nullptr;
    }
    script = locale.getScript();
    if (!notEmpty(script)) {
        script = nullptr;
    }
    region = locale.getCountry();
    if (!notEmpty(region)) {
        region = nullptr;
    }
    const char* variantsStr = locale.getVariant();
    const char* extensionsStr = locale_getKeywordsStart(locale.getName());
    // Writable, lower-cased copy of the variants. It is split in place and
    // the entries of the variants vector point into it.
    CharString variantsBuff(variantsStr, -1, status);
    if (!variantsBuff.isEmpty()) {
        if (U_FAILURE(status)) { return false; }
        char* start = variantsBuff.data();
        T_CString_toLowerCase(start);
        char* end;
        while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
               U_SUCCESS(status)) {
            *end = NULL_CHAR;  // null terminate inside variantsBuff
            variants.addElement(start, status);
            start = end + 1;
        }
        // The last (or only) variant has no trailing separator.
        variants.addElement(start, status);
    }
    if (U_FAILURE(status)) { return false; }

    // Sort the variants
    variants.sort([](UElement e1, UElement e2) -> int8_t {
        // uprv_strcmp returns int, and on some platforms, such as arm64-v8a,
        // it may return positive values > 127 which would become negative
        // when cast to int8_t. Normalize to -1/0/1.
        int res = uprv_strcmp(
            (const char*)e1.pointer, (const char*)e2.pointer);
        return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
    }, status);

    // Counts the successful replacements; also used to assert that the
    // loop below does not run too many times.
    int changed = 0;
    // A UVector to hold the CharStrings allocated by the replace* methods.
    // It is constructed with a deleter that deletes each CharString.
    UVector stringsToBeFreed([](void *obj){ delete ((CharString*) obj); },
                             nullptr, 10, status);
    while (U_SUCCESS(status)) {
        // Something is wrong with the data if five replacements have
        // already been applied.
        U_ASSERT(changed < 5);
        // From observation of key in data/misc/metadata.txt
        // we know currently we only need to search in the following combination
        // of fields for type in languageAlias:
        // * lang_region_variant
        // * lang_region
        // * lang_variant
        // * lang
        // * und_variant
        // This assumption is ensured by the U_ASSERT in readLanguageAlias
        //
        // The || chain stops at the first step that changed something.
        //                      lang  REGION variant
        if (    replaceLanguage(true, true,  true,  stringsToBeFreed, status) ||
                replaceLanguage(true, true,  false, stringsToBeFreed, status) ||
                replaceLanguage(true, false, true,  stringsToBeFreed, status) ||
                replaceLanguage(true, false, false, stringsToBeFreed, status) ||
                replaceLanguage(false,false, true,  stringsToBeFreed, status) ||
                replaceTerritory(stringsToBeFreed, status) ||
                replaceScript(status) ||
                replaceVariant(status)) {
            // Some values in data is changed, try to match from the beginning
            // again.
            changed++;
            continue;
        }
        // Nothing changed. Break out.
        break;
    }  // while (U_SUCCESS(status))

    if (U_FAILURE(status)) { return false; }
    // Nothing changed and we know the order of the variants is not changed
    // because we have no variant or only one.
    if (changed == 0 && variants.size() <= 1) {
        return false;
    }
    outputToString(out, status);
    // Keywords are carried over as they are.
    if (extensionsStr != nullptr) {
        out.append(extensionsStr, status);
    }
    if (U_FAILURE(status)) {
        return false;
    }
    // If the tag is not changed, return.
    if (uprv_strcmp(out.data(), locale.getName()) == 0) {
        U_ASSERT(changed == 0);
        U_ASSERT(variants.size() > 1);
        out.clear();
        return false;
    }
    return true;
}
1591
1592 // Return true if the locale is changed during canonicalization.
1593 // The replaced value then will be put into out.
1594 bool
canonicalizeLocale(const Locale & locale,CharString & out,UErrorCode & status)1595 canonicalizeLocale(const Locale& locale, CharString& out, UErrorCode& status)
1596 {
1597 AliasReplacer replacer(status);
1598 return replacer.replace(locale, out, status);
1599 }
1600
1601 // Function to optimize for known cases without so we can skip the loading
1602 // of resources in the startup time until we really need it.
1603 bool
isKnownCanonicalizedLocale(const char * locale,UErrorCode & status)1604 isKnownCanonicalizedLocale(const char* locale, UErrorCode& status)
1605 {
1606 if ( uprv_strcmp(locale, "c") == 0 ||
1607 uprv_strcmp(locale, "en") == 0 ||
1608 uprv_strcmp(locale, "en_US") == 0) {
1609 return true;
1610 }
1611
1612 // common well-known Canonicalized.
1613 umtx_initOnce(gKnownCanonicalizedInitOnce,
1614 &loadKnownCanonicalized, status);
1615 if (U_FAILURE(status)) {
1616 return false;
1617 }
1618 U_ASSERT(gKnownCanonicalized != nullptr);
1619 return uhash_geti(gKnownCanonicalized, locale) != 0;
1620 }
1621
1622 } // namespace
1623
1624 // Function for testing.
1625 U_CAPI const char* const*
ulocimp_getKnownCanonicalizedLocaleForTest(int32_t * length)1626 ulocimp_getKnownCanonicalizedLocaleForTest(int32_t* length)
1627 {
1628 *length = UPRV_LENGTHOF(KNOWN_CANONICALIZED);
1629 return KNOWN_CANONICALIZED;
1630 }
1631
1632 // Function for testing.
1633 U_CAPI bool
ulocimp_isCanonicalizedLocaleForTest(const char * localeName)1634 ulocimp_isCanonicalizedLocaleForTest(const char* localeName)
1635 {
1636 Locale l(localeName);
1637 UErrorCode status = U_ZERO_ERROR;
1638 CharString temp;
1639 return !canonicalizeLocale(l, temp, status) && U_SUCCESS(status);
1640 }
1641
/**
 * Initializes this Locale from a C locale ID.
 *
 * Any previously held heap storage for fullName/baseName is released first.
 * The ID is normalized with uloc_getName() or, when canonicalize is true,
 * uloc_canonicalize(); the result is then split into the language, script
 * and country fields and variantBegin is set.
 *
 * @param localeID     Locale ID to parse. When NULL, the default locale is
 *                     assigned to this object.
 * @param canonicalize When true, the ID is canonicalized and alias
 *                     replacement is applied unless the name is already
 *                     known to be canonical.
 * @return *this. There is no UErrorCode here: on any error the object is
 *         set to "bogus".
 */
Locale& Locale::init(const char* localeID, UBool canonicalize)
{
    fIsBogus = FALSE;
    /* Free our current storage */
    if (baseName != fullName) {
        uprv_free(baseName);
    }
    baseName = NULL;
    if(fullName != fullNameBuffer) {
        uprv_free(fullName);
        fullName = fullNameBuffer;
    }

    // not a loop:
    // just an easy way to have a common error-exit
    // without goto and without another function
    do {
        char *separator;
        // Start and length of up to five '_'-separated fields of fullName.
        char *field[5] = {0};
        int32_t fieldLen[5] = {0};
        int32_t fieldIdx;
        int32_t variantField;
        int32_t length;
        UErrorCode err;

        if(localeID == NULL) {
            // not an error, just set the default locale
            return *this = getDefault();
        }

        /* preset all fields to empty */
        language[0] = script[0] = country[0] = 0;

        // "canonicalize" the locale ID to ICU/Java format
        err = U_ZERO_ERROR;
        length = canonicalize ?
            uloc_canonicalize(localeID, fullName, sizeof(fullNameBuffer), &err) :
            uloc_getName(localeID, fullName, sizeof(fullNameBuffer), &err);

        if(err == U_BUFFER_OVERFLOW_ERROR || length >= (int32_t)sizeof(fullNameBuffer)) {
            /*Go to heap for the fullName if necessary*/
            fullName = (char *)uprv_malloc(sizeof(char)*(length + 1));
            if(fullName == 0) {
                fullName = fullNameBuffer;
                break; // error: out of memory
            }
            // Second attempt, now with a buffer of the reported size.
            err = U_ZERO_ERROR;
            length = canonicalize ?
                uloc_canonicalize(localeID, fullName, length+1, &err) :
                uloc_getName(localeID, fullName, length+1, &err);
        }
        if(U_FAILURE(err) || err == U_STRING_NOT_TERMINATED_WARNING) {
            /* should never occur */
            break;
        }

        // Default: no variant; corrected below once the fields are known.
        variantBegin = length;

        /* after uloc_getName/canonicalize() we know that only '_' are separators */
        /* But _ could also appear in a timezone such as "en@timezone=America/Los_Angeles" */
        separator = field[0] = fullName;
        fieldIdx = 1;
        char* at = uprv_strchr(fullName, '@');
        // Record field boundaries, stopping at the keyword section ('@')
        // or when the field array is full.
        while ((separator = uprv_strchr(field[fieldIdx-1], SEP_CHAR)) != 0 &&
               fieldIdx < UPRV_LENGTHOF(field)-1 &&
               (at == nullptr || separator < at)) {
            field[fieldIdx] = separator + 1;
            fieldLen[fieldIdx-1] = (int32_t)(separator - field[fieldIdx-1]);
            fieldIdx++;
        }
        // variant may contain @foo or .foo POSIX cruft; remove it
        separator = uprv_strchr(field[fieldIdx-1], '@');
        char* sep2 = uprv_strchr(field[fieldIdx-1], '.');
        if (separator!=NULL || sep2!=NULL) {
            // Use whichever of '@' and '.' comes first.
            if (separator==NULL || (sep2!=NULL && separator > sep2)) {
                separator = sep2;
            }
            fieldLen[fieldIdx-1] = (int32_t)(separator - field[fieldIdx-1]);
        } else {
            fieldLen[fieldIdx-1] = length - (int32_t)(field[fieldIdx-1] - fullName);
        }

        if (fieldLen[0] >= (int32_t)(sizeof(language)))
        {
            break; // error: the language field is too long
        }

        variantField = 1; /* Usually the 2nd one, except when a script or country is also used. */
        if (fieldLen[0] > 0) {
            /* We have a language */
            uprv_memcpy(language, fullName, fieldLen[0]);
            language[fieldLen[0]] = 0;
        }
        // A script is exactly four ASCII letters.
        if (fieldLen[1] == 4 && uprv_isASCIILetter(field[1][0]) &&
                uprv_isASCIILetter(field[1][1]) && uprv_isASCIILetter(field[1][2]) &&
                uprv_isASCIILetter(field[1][3])) {
            /* We have at least a script */
            uprv_memcpy(script, field[1], fieldLen[1]);
            script[fieldLen[1]] = 0;
            variantField++;
        }

        if (fieldLen[variantField] == 2 || fieldLen[variantField] == 3) {
            /* We have a country */
            uprv_memcpy(country, field[variantField], fieldLen[variantField]);
            country[fieldLen[variantField]] = 0;
            variantField++;
        } else if (fieldLen[variantField] == 0) {
            variantField++; /* script or country empty but variant in next field (i.e. en__POSIX) */
        }

        if (fieldLen[variantField] > 0) {
            /* We have a variant */
            variantBegin = (int32_t)(field[variantField] - fullName);
        }

        err = U_ZERO_ERROR;
        initBaseName(err);
        if (U_FAILURE(err)) {
            break;
        }

        if (canonicalize) {
            if (!isKnownCanonicalizedLocale(fullName, err)) {
                CharString replaced;
                // Not sure it is already canonicalized
                if (canonicalizeLocale(*this, replaced, err)) {
                    U_ASSERT(U_SUCCESS(err));
                    // If need replacement, call init again.
                    // canonicalize is false here, so this does not recurse
                    // any further.
                    init(replaced.data(), false);
                }
                if (U_FAILURE(err)) {
                    break;
                }
            }
        }   // if (canonicalize) {

        // successful end of init()
        return *this;
    } while(0); /*loop doesn't iterate*/

    // when an error occurs, then set this object to "bogus" (there is no UErrorCode here)
    setToBogus();

    return *this;
}
1789
1790 /*
1791 * Set up the base name.
1792 * If there are no key words, it's exactly the full name.
1793 * If key words exist, it's the full name truncated at the '@' character.
1794 * Need to set up both at init() and after setting a keyword.
1795 */
1796 void
initBaseName(UErrorCode & status)1797 Locale::initBaseName(UErrorCode &status) {
1798 if (U_FAILURE(status)) {
1799 return;
1800 }
1801 U_ASSERT(baseName==NULL || baseName==fullName);
1802 const char *atPtr = uprv_strchr(fullName, '@');
1803 const char *eqPtr = uprv_strchr(fullName, '=');
1804 if (atPtr && eqPtr && atPtr < eqPtr) {
1805 // Key words exist.
1806 int32_t baseNameLength = (int32_t)(atPtr - fullName);
1807 baseName = (char *)uprv_malloc(baseNameLength + 1);
1808 if (baseName == NULL) {
1809 status = U_MEMORY_ALLOCATION_ERROR;
1810 return;
1811 }
1812 uprv_strncpy(baseName, fullName, baseNameLength);
1813 baseName[baseNameLength] = 0;
1814
1815 // The original computation of variantBegin leaves it equal to the length
1816 // of fullName if there is no variant. It should instead be
1817 // the length of the baseName.
1818 if (variantBegin > baseNameLength) {
1819 variantBegin = baseNameLength;
1820 }
1821 } else {
1822 baseName = fullName;
1823 }
1824 }
1825
1826
1827 int32_t
hashCode() const1828 Locale::hashCode() const
1829 {
1830 return ustr_hashCharsN(fullName, static_cast<int32_t>(uprv_strlen(fullName)));
1831 }
1832
1833 void
setToBogus()1834 Locale::setToBogus() {
1835 /* Free our current storage */
1836 if(baseName != fullName) {
1837 uprv_free(baseName);
1838 }
1839 baseName = NULL;
1840 if(fullName != fullNameBuffer) {
1841 uprv_free(fullName);
1842 fullName = fullNameBuffer;
1843 }
1844 *fullNameBuffer = 0;
1845 *language = 0;
1846 *script = 0;
1847 *country = 0;
1848 fIsBogus = TRUE;
1849 variantBegin = 0;
1850 }
1851
1852 const Locale& U_EXPORT2
getDefault()1853 Locale::getDefault()
1854 {
1855 {
1856 Mutex lock(&gDefaultLocaleMutex);
1857 if (gDefaultLocale != NULL) {
1858 return *gDefaultLocale;
1859 }
1860 }
1861 UErrorCode status = U_ZERO_ERROR;
1862 return *locale_set_default_internal(NULL, status);
1863 }
1864
1865
1866
1867 void U_EXPORT2
setDefault(const Locale & newLocale,UErrorCode & status)1868 Locale::setDefault( const Locale& newLocale,
1869 UErrorCode& status)
1870 {
1871 if (U_FAILURE(status)) {
1872 return;
1873 }
1874
1875 /* Set the default from the full name string of the supplied locale.
1876 * This is a convenient way to access the default locale caching mechanisms.
1877 */
1878 const char *localeID = newLocale.getName();
1879 locale_set_default_internal(localeID, status);
1880 }
1881
1882 void
addLikelySubtags(UErrorCode & status)1883 Locale::addLikelySubtags(UErrorCode& status) {
1884 if (U_FAILURE(status)) {
1885 return;
1886 }
1887
1888 CharString maximizedLocaleID;
1889 {
1890 CharStringByteSink sink(&maximizedLocaleID);
1891 ulocimp_addLikelySubtags(fullName, sink, &status);
1892 }
1893
1894 if (U_FAILURE(status)) {
1895 return;
1896 }
1897
1898 init(maximizedLocaleID.data(), /*canonicalize=*/FALSE);
1899 if (isBogus()) {
1900 status = U_ILLEGAL_ARGUMENT_ERROR;
1901 }
1902 }
1903
1904 void
minimizeSubtags(UErrorCode & status)1905 Locale::minimizeSubtags(UErrorCode& status) {
1906 if (U_FAILURE(status)) {
1907 return;
1908 }
1909
1910 CharString minimizedLocaleID;
1911 {
1912 CharStringByteSink sink(&minimizedLocaleID);
1913 ulocimp_minimizeSubtags(fullName, sink, &status);
1914 }
1915
1916 if (U_FAILURE(status)) {
1917 return;
1918 }
1919
1920 init(minimizedLocaleID.data(), /*canonicalize=*/FALSE);
1921 if (isBogus()) {
1922 status = U_ILLEGAL_ARGUMENT_ERROR;
1923 }
1924 }
1925
1926 void
canonicalize(UErrorCode & status)1927 Locale::canonicalize(UErrorCode& status) {
1928 if (U_FAILURE(status)) {
1929 return;
1930 }
1931 if (isBogus()) {
1932 status = U_ILLEGAL_ARGUMENT_ERROR;
1933 return;
1934 }
1935 CharString uncanonicalized(fullName, status);
1936 if (U_FAILURE(status)) {
1937 return;
1938 }
1939 init(uncanonicalized.data(), /*canonicalize=*/TRUE);
1940 if (isBogus()) {
1941 status = U_ILLEGAL_ARGUMENT_ERROR;
1942 }
1943 }
1944
1945 Locale U_EXPORT2
forLanguageTag(StringPiece tag,UErrorCode & status)1946 Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
1947 {
1948 Locale result(Locale::eBOGUS);
1949
1950 if (U_FAILURE(status)) {
1951 return result;
1952 }
1953
1954 // If a BCP 47 language tag is passed as the language parameter to the
1955 // normal Locale constructor, it will actually fall back to invoking
1956 // uloc_forLanguageTag() to parse it if it somehow is able to detect that
1957 // the string actually is BCP 47. This works well for things like strings
1958 // using BCP 47 extensions, but it does not at all work for things like
1959 // legacy language tags (marked as “Type: grandfathered” in BCP 47,
1960 // e.g., "en-GB-oed") which are possible to also
1961 // interpret as ICU locale IDs and because of that won't trigger the BCP 47
1962 // parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
1963 // and then Locale::init(), instead of just calling the normal constructor.
1964
1965 CharString localeID;
1966 int32_t parsedLength;
1967 {
1968 CharStringByteSink sink(&localeID);
1969 ulocimp_forLanguageTag(
1970 tag.data(),
1971 tag.length(),
1972 sink,
1973 &parsedLength,
1974 &status);
1975 }
1976
1977 if (U_FAILURE(status)) {
1978 return result;
1979 }
1980
1981 if (parsedLength != tag.size()) {
1982 status = U_ILLEGAL_ARGUMENT_ERROR;
1983 return result;
1984 }
1985
1986 result.init(localeID.data(), /*canonicalize=*/FALSE);
1987 if (result.isBogus()) {
1988 status = U_ILLEGAL_ARGUMENT_ERROR;
1989 }
1990 return result;
1991 }
1992
1993 void
toLanguageTag(ByteSink & sink,UErrorCode & status) const1994 Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const
1995 {
1996 if (U_FAILURE(status)) {
1997 return;
1998 }
1999
2000 if (fIsBogus) {
2001 status = U_ILLEGAL_ARGUMENT_ERROR;
2002 return;
2003 }
2004
2005 ulocimp_toLanguageTag(fullName, sink, /*strict=*/FALSE, &status);
2006 }
2007
2008 Locale U_EXPORT2
createFromName(const char * name)2009 Locale::createFromName (const char *name)
2010 {
2011 if (name) {
2012 Locale l("");
2013 l.init(name, FALSE);
2014 return l;
2015 }
2016 else {
2017 return getDefault();
2018 }
2019 }
2020
2021 Locale U_EXPORT2
createCanonical(const char * name)2022 Locale::createCanonical(const char* name) {
2023 Locale loc("");
2024 loc.init(name, TRUE);
2025 return loc;
2026 }
2027
2028 const char *
getISO3Language() const2029 Locale::getISO3Language() const
2030 {
2031 return uloc_getISO3Language(fullName);
2032 }
2033
2034
2035 const char *
getISO3Country() const2036 Locale::getISO3Country() const
2037 {
2038 return uloc_getISO3Country(fullName);
2039 }
2040
2041 /**
2042 * Return the LCID value as specified in the "LocaleID" resource for this
2043 * locale. The LocaleID must be expressed as a hexadecimal number, from
2044 * one to four digits. If the LocaleID resource is not present, or is
2045 * in an incorrect format, 0 is returned. The LocaleID is for use in
2046 * Windows (it is an LCID), but is available on all platforms.
2047 */
2048 uint32_t
getLCID() const2049 Locale::getLCID() const
2050 {
2051 return uloc_getLCID(fullName);
2052 }
2053
getISOCountries()2054 const char* const* U_EXPORT2 Locale::getISOCountries()
2055 {
2056 return uloc_getISOCountries();
2057 }
2058
getISOLanguages()2059 const char* const* U_EXPORT2 Locale::getISOLanguages()
2060 {
2061 return uloc_getISOLanguages();
2062 }
2063
2064 // Set the locale's data based on a posix id.
setFromPOSIXID(const char * posixID)2065 void Locale::setFromPOSIXID(const char *posixID)
2066 {
2067 init(posixID, TRUE);
2068 }
2069
2070 const Locale & U_EXPORT2
getRoot(void)2071 Locale::getRoot(void)
2072 {
2073 return getLocale(eROOT);
2074 }
2075
2076 const Locale & U_EXPORT2
getEnglish(void)2077 Locale::getEnglish(void)
2078 {
2079 return getLocale(eENGLISH);
2080 }
2081
2082 const Locale & U_EXPORT2
getFrench(void)2083 Locale::getFrench(void)
2084 {
2085 return getLocale(eFRENCH);
2086 }
2087
2088 const Locale & U_EXPORT2
getGerman(void)2089 Locale::getGerman(void)
2090 {
2091 return getLocale(eGERMAN);
2092 }
2093
2094 const Locale & U_EXPORT2
getItalian(void)2095 Locale::getItalian(void)
2096 {
2097 return getLocale(eITALIAN);
2098 }
2099
2100 const Locale & U_EXPORT2
getJapanese(void)2101 Locale::getJapanese(void)
2102 {
2103 return getLocale(eJAPANESE);
2104 }
2105
2106 const Locale & U_EXPORT2
getKorean(void)2107 Locale::getKorean(void)
2108 {
2109 return getLocale(eKOREAN);
2110 }
2111
2112 const Locale & U_EXPORT2
getChinese(void)2113 Locale::getChinese(void)
2114 {
2115 return getLocale(eCHINESE);
2116 }
2117
2118 const Locale & U_EXPORT2
getSimplifiedChinese(void)2119 Locale::getSimplifiedChinese(void)
2120 {
2121 return getLocale(eCHINA);
2122 }
2123
2124 const Locale & U_EXPORT2
getTraditionalChinese(void)2125 Locale::getTraditionalChinese(void)
2126 {
2127 return getLocale(eTAIWAN);
2128 }
2129
2130
2131 const Locale & U_EXPORT2
getFrance(void)2132 Locale::getFrance(void)
2133 {
2134 return getLocale(eFRANCE);
2135 }
2136
2137 const Locale & U_EXPORT2
getGermany(void)2138 Locale::getGermany(void)
2139 {
2140 return getLocale(eGERMANY);
2141 }
2142
2143 const Locale & U_EXPORT2
getItaly(void)2144 Locale::getItaly(void)
2145 {
2146 return getLocale(eITALY);
2147 }
2148
2149 const Locale & U_EXPORT2
getJapan(void)2150 Locale::getJapan(void)
2151 {
2152 return getLocale(eJAPAN);
2153 }
2154
2155 const Locale & U_EXPORT2
getKorea(void)2156 Locale::getKorea(void)
2157 {
2158 return getLocale(eKOREA);
2159 }
2160
2161 const Locale & U_EXPORT2
getChina(void)2162 Locale::getChina(void)
2163 {
2164 return getLocale(eCHINA);
2165 }
2166
2167 const Locale & U_EXPORT2
getPRC(void)2168 Locale::getPRC(void)
2169 {
2170 return getLocale(eCHINA);
2171 }
2172
2173 const Locale & U_EXPORT2
getTaiwan(void)2174 Locale::getTaiwan(void)
2175 {
2176 return getLocale(eTAIWAN);
2177 }
2178
2179 const Locale & U_EXPORT2
getUK(void)2180 Locale::getUK(void)
2181 {
2182 return getLocale(eUK);
2183 }
2184
2185 const Locale & U_EXPORT2
getUS(void)2186 Locale::getUS(void)
2187 {
2188 return getLocale(eUS);
2189 }
2190
2191 const Locale & U_EXPORT2
getCanada(void)2192 Locale::getCanada(void)
2193 {
2194 return getLocale(eCANADA);
2195 }
2196
2197 const Locale & U_EXPORT2
getCanadaFrench(void)2198 Locale::getCanadaFrench(void)
2199 {
2200 return getLocale(eCANADA_FRENCH);
2201 }
2202
2203 const Locale &
getLocale(int locid)2204 Locale::getLocale(int locid)
2205 {
2206 Locale *localeCache = getLocaleCache();
2207 U_ASSERT((locid < eMAX_LOCALES)&&(locid>=0));
2208 if (localeCache == NULL) {
2209 // Failure allocating the locale cache.
2210 // The best we can do is return a NULL reference.
2211 locid = 0;
2212 }
2213 return localeCache[locid]; /*operating on NULL*/
2214 }
2215
2216 /*
2217 This function is defined this way in order to get around static
2218 initialization and static destruction.
2219 */
2220 Locale *
getLocaleCache(void)2221 Locale::getLocaleCache(void)
2222 {
2223 UErrorCode status = U_ZERO_ERROR;
2224 umtx_initOnce(gLocaleCacheInitOnce, locale_init, status);
2225 return gLocaleCache;
2226 }
2227
2228 class KeywordEnumeration : public StringEnumeration {
2229 private:
2230 char *keywords;
2231 char *current;
2232 int32_t length;
2233 UnicodeString currUSKey;
2234 static const char fgClassID;/* Warning this is used beyond the typical RTTI usage. */
2235
2236 public:
getStaticClassID(void)2237 static UClassID U_EXPORT2 getStaticClassID(void) { return (UClassID)&fgClassID; }
getDynamicClassID(void) const2238 virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }
2239 public:
KeywordEnumeration(const char * keys,int32_t keywordLen,int32_t currentIndex,UErrorCode & status)2240 KeywordEnumeration(const char *keys, int32_t keywordLen, int32_t currentIndex, UErrorCode &status)
2241 : keywords((char *)&fgClassID), current((char *)&fgClassID), length(0) {
2242 if(U_SUCCESS(status) && keywordLen != 0) {
2243 if(keys == NULL || keywordLen < 0) {
2244 status = U_ILLEGAL_ARGUMENT_ERROR;
2245 } else {
2246 keywords = (char *)uprv_malloc(keywordLen+1);
2247 if (keywords == NULL) {
2248 status = U_MEMORY_ALLOCATION_ERROR;
2249 }
2250 else {
2251 uprv_memcpy(keywords, keys, keywordLen);
2252 keywords[keywordLen] = 0;
2253 current = keywords + currentIndex;
2254 length = keywordLen;
2255 }
2256 }
2257 }
2258 }
2259
2260 virtual ~KeywordEnumeration();
2261
clone() const2262 virtual StringEnumeration * clone() const
2263 {
2264 UErrorCode status = U_ZERO_ERROR;
2265 return new KeywordEnumeration(keywords, length, (int32_t)(current - keywords), status);
2266 }
2267
count(UErrorCode &) const2268 virtual int32_t count(UErrorCode &/*status*/) const {
2269 char *kw = keywords;
2270 int32_t result = 0;
2271 while(*kw) {
2272 result++;
2273 kw += uprv_strlen(kw)+1;
2274 }
2275 return result;
2276 }
2277
next(int32_t * resultLength,UErrorCode & status)2278 virtual const char* next(int32_t* resultLength, UErrorCode& status) {
2279 const char* result;
2280 int32_t len;
2281 if(U_SUCCESS(status) && *current != 0) {
2282 result = current;
2283 len = (int32_t)uprv_strlen(current);
2284 current += len+1;
2285 if(resultLength != NULL) {
2286 *resultLength = len;
2287 }
2288 } else {
2289 if(resultLength != NULL) {
2290 *resultLength = 0;
2291 }
2292 result = NULL;
2293 }
2294 return result;
2295 }
2296
snext(UErrorCode & status)2297 virtual const UnicodeString* snext(UErrorCode& status) {
2298 int32_t resultLength = 0;
2299 const char *s = next(&resultLength, status);
2300 return setChars(s, resultLength, status);
2301 }
2302
reset(UErrorCode &)2303 virtual void reset(UErrorCode& /*status*/) {
2304 current = keywords;
2305 }
2306 };
2307
2308 const char KeywordEnumeration::fgClassID = '\0';
2309
~KeywordEnumeration()2310 KeywordEnumeration::~KeywordEnumeration() {
2311 uprv_free(keywords);
2312 }
2313
2314 // A wrapper around KeywordEnumeration that calls uloc_toUnicodeLocaleKey() in
2315 // the next() method for each keyword before returning it.
2316 class UnicodeKeywordEnumeration : public KeywordEnumeration {
2317 public:
2318 using KeywordEnumeration::KeywordEnumeration;
2319 virtual ~UnicodeKeywordEnumeration();
2320
next(int32_t * resultLength,UErrorCode & status)2321 virtual const char* next(int32_t* resultLength, UErrorCode& status) {
2322 const char* legacy_key = KeywordEnumeration::next(nullptr, status);
2323 if (U_SUCCESS(status) && legacy_key != nullptr) {
2324 const char* key = uloc_toUnicodeLocaleKey(legacy_key);
2325 if (key == nullptr) {
2326 status = U_ILLEGAL_ARGUMENT_ERROR;
2327 } else {
2328 if (resultLength != nullptr) {
2329 *resultLength = static_cast<int32_t>(uprv_strlen(key));
2330 }
2331 return key;
2332 }
2333 }
2334 if (resultLength != nullptr) *resultLength = 0;
2335 return nullptr;
2336 }
2337 };
2338
2339 // Out-of-line virtual destructor to serve as the "key function".
2340 UnicodeKeywordEnumeration::~UnicodeKeywordEnumeration() = default;
2341
2342 StringEnumeration *
createKeywords(UErrorCode & status) const2343 Locale::createKeywords(UErrorCode &status) const
2344 {
2345 StringEnumeration *result = NULL;
2346
2347 if (U_FAILURE(status)) {
2348 return result;
2349 }
2350
2351 const char* variantStart = uprv_strchr(fullName, '@');
2352 const char* assignment = uprv_strchr(fullName, '=');
2353 if(variantStart) {
2354 if(assignment > variantStart) {
2355 CharString keywords;
2356 CharStringByteSink sink(&keywords);
2357 ulocimp_getKeywords(variantStart+1, '@', sink, FALSE, &status);
2358 if (U_SUCCESS(status) && !keywords.isEmpty()) {
2359 result = new KeywordEnumeration(keywords.data(), keywords.length(), 0, status);
2360 if (!result) {
2361 status = U_MEMORY_ALLOCATION_ERROR;
2362 }
2363 }
2364 } else {
2365 status = U_INVALID_FORMAT_ERROR;
2366 }
2367 }
2368 return result;
2369 }
2370
2371 StringEnumeration *
createUnicodeKeywords(UErrorCode & status) const2372 Locale::createUnicodeKeywords(UErrorCode &status) const
2373 {
2374 StringEnumeration *result = NULL;
2375
2376 if (U_FAILURE(status)) {
2377 return result;
2378 }
2379
2380 const char* variantStart = uprv_strchr(fullName, '@');
2381 const char* assignment = uprv_strchr(fullName, '=');
2382 if(variantStart) {
2383 if(assignment > variantStart) {
2384 CharString keywords;
2385 CharStringByteSink sink(&keywords);
2386 ulocimp_getKeywords(variantStart+1, '@', sink, FALSE, &status);
2387 if (U_SUCCESS(status) && !keywords.isEmpty()) {
2388 result = new UnicodeKeywordEnumeration(keywords.data(), keywords.length(), 0, status);
2389 if (!result) {
2390 status = U_MEMORY_ALLOCATION_ERROR;
2391 }
2392 }
2393 } else {
2394 status = U_INVALID_FORMAT_ERROR;
2395 }
2396 }
2397 return result;
2398 }
2399
2400 int32_t
getKeywordValue(const char * keywordName,char * buffer,int32_t bufLen,UErrorCode & status) const2401 Locale::getKeywordValue(const char* keywordName, char *buffer, int32_t bufLen, UErrorCode &status) const
2402 {
2403 return uloc_getKeywordValue(fullName, keywordName, buffer, bufLen, &status);
2404 }
2405
2406 void
getKeywordValue(StringPiece keywordName,ByteSink & sink,UErrorCode & status) const2407 Locale::getKeywordValue(StringPiece keywordName, ByteSink& sink, UErrorCode& status) const {
2408 if (U_FAILURE(status)) {
2409 return;
2410 }
2411
2412 if (fIsBogus) {
2413 status = U_ILLEGAL_ARGUMENT_ERROR;
2414 return;
2415 }
2416
2417 // TODO: Remove the need for a const char* to a NUL terminated buffer.
2418 const CharString keywordName_nul(keywordName, status);
2419 if (U_FAILURE(status)) {
2420 return;
2421 }
2422
2423 ulocimp_getKeywordValue(fullName, keywordName_nul.data(), sink, &status);
2424 }
2425
2426 void
getUnicodeKeywordValue(StringPiece keywordName,ByteSink & sink,UErrorCode & status) const2427 Locale::getUnicodeKeywordValue(StringPiece keywordName,
2428 ByteSink& sink,
2429 UErrorCode& status) const {
2430 // TODO: Remove the need for a const char* to a NUL terminated buffer.
2431 const CharString keywordName_nul(keywordName, status);
2432 if (U_FAILURE(status)) {
2433 return;
2434 }
2435
2436 const char* legacy_key = uloc_toLegacyKey(keywordName_nul.data());
2437
2438 if (legacy_key == nullptr) {
2439 status = U_ILLEGAL_ARGUMENT_ERROR;
2440 return;
2441 }
2442
2443 CharString legacy_value;
2444 {
2445 CharStringByteSink sink(&legacy_value);
2446 getKeywordValue(legacy_key, sink, status);
2447 }
2448
2449 if (U_FAILURE(status)) {
2450 return;
2451 }
2452
2453 const char* unicode_value = uloc_toUnicodeLocaleType(
2454 keywordName_nul.data(), legacy_value.data());
2455
2456 if (unicode_value == nullptr) {
2457 status = U_ILLEGAL_ARGUMENT_ERROR;
2458 return;
2459 }
2460
2461 sink.Append(unicode_value, static_cast<int32_t>(uprv_strlen(unicode_value)));
2462 }
2463
2464 void
setKeywordValue(const char * keywordName,const char * keywordValue,UErrorCode & status)2465 Locale::setKeywordValue(const char* keywordName, const char* keywordValue, UErrorCode &status)
2466 {
2467 if (U_FAILURE(status)) {
2468 return;
2469 }
2470 if (status == U_STRING_NOT_TERMINATED_WARNING) {
2471 status = U_ZERO_ERROR;
2472 }
2473 int32_t bufferLength = uprv_max((int32_t)(uprv_strlen(fullName) + 1), ULOC_FULLNAME_CAPACITY);
2474 int32_t newLength = uloc_setKeywordValue(keywordName, keywordValue, fullName,
2475 bufferLength, &status) + 1;
2476 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
2477 /* Handle the case the current buffer is not enough to hold the new id */
2478 if (status == U_BUFFER_OVERFLOW_ERROR) {
2479 U_ASSERT(newLength > bufferLength);
2480 char* newFullName = (char *)uprv_malloc(newLength);
2481 if (newFullName == nullptr) {
2482 status = U_MEMORY_ALLOCATION_ERROR;
2483 return;
2484 }
2485 uprv_strcpy(newFullName, fullName);
2486 if (fullName != fullNameBuffer) {
2487 // if full Name is already on the heap, need to free it.
2488 uprv_free(fullName);
2489 }
2490 fullName = newFullName;
2491 status = U_ZERO_ERROR;
2492 uloc_setKeywordValue(keywordName, keywordValue, fullName, newLength, &status);
2493 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
2494 } else {
2495 U_ASSERT(newLength <= bufferLength);
2496 }
2497 if (U_SUCCESS(status) && baseName == fullName) {
2498 // May have added the first keyword, meaning that the fullName is no longer also the baseName.
2499 initBaseName(status);
2500 }
2501 }
2502
2503 void
setKeywordValue(StringPiece keywordName,StringPiece keywordValue,UErrorCode & status)2504 Locale::setKeywordValue(StringPiece keywordName,
2505 StringPiece keywordValue,
2506 UErrorCode& status) {
2507 // TODO: Remove the need for a const char* to a NUL terminated buffer.
2508 const CharString keywordName_nul(keywordName, status);
2509 const CharString keywordValue_nul(keywordValue, status);
2510 setKeywordValue(keywordName_nul.data(), keywordValue_nul.data(), status);
2511 }
2512
2513 void
setUnicodeKeywordValue(StringPiece keywordName,StringPiece keywordValue,UErrorCode & status)2514 Locale::setUnicodeKeywordValue(StringPiece keywordName,
2515 StringPiece keywordValue,
2516 UErrorCode& status) {
2517 // TODO: Remove the need for a const char* to a NUL terminated buffer.
2518 const CharString keywordName_nul(keywordName, status);
2519 const CharString keywordValue_nul(keywordValue, status);
2520
2521 if (U_FAILURE(status)) {
2522 return;
2523 }
2524
2525 const char* legacy_key = uloc_toLegacyKey(keywordName_nul.data());
2526
2527 if (legacy_key == nullptr) {
2528 status = U_ILLEGAL_ARGUMENT_ERROR;
2529 return;
2530 }
2531
2532 const char* legacy_value = nullptr;
2533
2534 if (!keywordValue_nul.isEmpty()) {
2535 legacy_value =
2536 uloc_toLegacyType(keywordName_nul.data(), keywordValue_nul.data());
2537
2538 if (legacy_value == nullptr) {
2539 status = U_ILLEGAL_ARGUMENT_ERROR;
2540 return;
2541 }
2542 }
2543
2544 setKeywordValue(legacy_key, legacy_value, status);
2545 }
2546
2547 const char *
getBaseName() const2548 Locale::getBaseName() const {
2549 return baseName;
2550 }
2551
// Defaulted destructor of Locale::Iterator, defined out of line in this file.
Locale::Iterator::~Iterator() = default;
2553
2554 //eof
2555 U_NAMESPACE_END
2556