1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2009-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
17 #include "ustr_imp.h"
18 #include "bytesinkutil.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24 #include "ulocimp.h"
25 #include "uassert.h"
26
27
/*
 * Node of a singly linked list of variant subtags.
 * The node does not own the string that `variant` points to.
 */
struct VariantListEntry {
    const char          *variant;   /* variant subtag text */
    VariantListEntry    *next;      /* following node, or NULL at the tail */
};
33
34 /* struct holding a single attribute value */
35 struct AttributeListEntry : public icu::UMemory {
36 const char *attribute;
37 struct AttributeListEntry *next;
38 };
39
40 /* struct holding a single extension */
41 struct ExtensionListEntry : public icu::UMemory {
42 const char *key;
43 const char *value;
44 struct ExtensionListEntry *next;
45 };
46
47 #define MAXEXTLANG 3
48 typedef struct ULanguageTag {
49 char *buf; /* holding parsed subtags */
50 const char *language;
51 const char *extlang[MAXEXTLANG];
52 const char *script;
53 const char *region;
54 VariantListEntry *variants;
55 ExtensionListEntry *extensions;
56 const char *privateuse;
57 const char *legacy;
58 } ULanguageTag;
59
/* NOTE(review): presumably the minimum length of a tag; no use is visible in this part of the file. */
#define MINLEN 2
/* Separator between the subtags of a language tag. */
#define SEP '-'
/* Singleton that introduces the private use part; rejected as an extension singleton. */
#define PRIVATEUSE 'x'
/* Singleton of the "u" extension; used when ordering extension keys. */
#define LDMLEXT 'u'

/*
 * Separator characters of ICU locale IDs. LOCALE_SEP is also accepted as a
 * variant separator when converting a locale ID; the other three are
 * presumably the '@', ';' and '=' of the keyword section (confirm at their uses).
 */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* Character class tests on a single char. ISNUMERIC evaluates its argument twice. */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')

/* Default value of the string fields of a freshly initialized ULanguageTag. */
static const char EMPTY[] = "";
/* Language subtag emitted when a locale has no usable language. */
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
/* Lowercased POSIX variant; it is not emitted as a regular variant subtag. */
static const char POSIX_VALUE[] = "posix";
/* Extension key that gets special ordering in _addExtensionToList(). */
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND, excluding the terminating NUL. */
#define LANG_UND_LEN 3
84
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 This table has 2 parts. The part for
 legacy language tags (marked as “Type: grandfathered” in BCP 47)
 is generated by the following scripts from the IANA language tag registry.

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 egrep -A 7 'Type: grandfathered' | \
 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
 tr 'A-Z' 'a-z'


 The 2nd part is made of five ICU-specific entries. They are kept for
 backward compatibility for now, even though they have no preferred
 values in the registry. They may have to be removed for strict BCP 47
 compliance.

 Layout: a flat array of string pairs; each even index holds a legacy tag
 and the following odd index holds its replacement.
*/
static const char* const LEGACY[] = {
    /* legacy preferred */
    "art-lojban", "jbo",
    "en-gb-oed", "en-gb-oxendict",
    "i-ami", "ami",
    "i-bnn", "bnn",
    "i-hak", "hak",
    "i-klingon", "tlh",
    "i-lux", "lb",
    "i-navajo", "nv",
    "i-pwn", "pwn",
    "i-tao", "tao",
    "i-tay", "tay",
    "i-tsu", "tsu",
    "no-bok", "nb",
    "no-nyn", "nn",
    "sgn-be-fr", "sfb",
    "sgn-be-nl", "vgt",
    "sgn-ch-de", "sgg",
    "zh-guoyu", "cmn",
    "zh-hakka", "hak",
    "zh-min-nan", "nan",
    "zh-xiang", "hsn",

    // Legacy tags with no preferred value in the IANA
    // registry. Kept for now for backward compatibility
    // because ICU has mapped them this way.
    "cel-gaulish", "xtg-x-cel-gaulish",
    "i-default", "en-x-i-default",
    "i-enochian", "und-x-i-enochian",
    "i-mingo", "see-x-i-mingo",
    "zh-min", "nan-x-zh-min",
};
138
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 The table lists redundant tags with a preferred value in the IANA language tag registry.
 It's generated with the following command:

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
 tr 'A-Z' 'a-z'

 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
 the variant tag 'hepburn-heploc' has the preferred subtag 'alalc97'.

 Layout: a flat array of string pairs; each even index holds a redundant tag
 and the following odd index holds its preferred replacement.
*/

static const char* const REDUNDANT[] = {
    // redundant preferred
    "sgn-br", "bzs",
    "sgn-co", "csn",
    "sgn-de", "gsg",
    "sgn-dk", "dsl",
    "sgn-es", "ssp",
    "sgn-fr", "fsl",
    "sgn-gb", "bfi",
    "sgn-gr", "gss",
    "sgn-ie", "isg",
    "sgn-it", "ise",
    "sgn-jp", "jsl",
    "sgn-mx", "mfs",
    "sgn-ni", "ncs",
    "sgn-nl", "dse",
    "sgn-no", "nsl",
    "sgn-pt", "psr",
    "sgn-se", "swl",
    "sgn-us", "ase",
    "sgn-za", "sfs",
    "zh-cmn", "cmn",
    "zh-cmn-hans", "cmn-hans",
    "zh-cmn-hant", "cmn-hant",
    "zh-gan", "gan",
    "zh-wuu", "wuu",
    "zh-yue", "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
186
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
 grep -B1 'Preferred' | grep -v '^--' | \
 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

 Make sure that 2-letter language subtags come before 3-letter subtags:
 _appendLanguageToLanguageTag() stops scanning as soon as it reaches an
 entry longer than the language code it is looking up.

 Layout: rows of char[4] (up to 3 letters plus NUL); each even row holds a
 deprecated subtag and the following odd row holds its replacement.
*/
static const char DEPRECATEDLANGS[][4] = {
    /* deprecated new */
    "in", "id",
    "iw", "he",
    "ji", "yi",
    "jw", "jv",
    "mo", "ro",
    "aam", "aas",
    "adp", "dz",
    "aue", "ktz",
    "ayx", "nun",
    "bgm", "bcg",
    "bjd", "drl",
    "ccq", "rki",
    "cjr", "mom",
    "cka", "cmr",
    "cmk", "xch",
    "coy", "pij",
    "cqu", "quh",
    "drh", "khk",
    "drw", "prs",
    "gav", "dev",
    "gfx", "vaj",
    "ggn", "gvr",
    "gti", "nyc",
    "guv", "duz",
    "hrr", "jal",
    "ibi", "opa",
    "ilw", "gal",
    "jeg", "oyb",
    "kgc", "tdf",
    "kgh", "kml",
    "koj", "kwv",
    "krm", "bmf",
    "ktr", "dtp",
    "kvs", "gdj",
    "kwq", "yam",
    "kxe", "tvd",
    "kzj", "dtp",
    "kzt", "dtp",
    "lii", "raq",
    "lmm", "rmx",
    "meg", "cir",
    "mst", "mry",
    "mwj", "vaj",
    "myt", "mry",
    "nad", "xny",
    "ncp", "kdz",
    "nnx", "ngv",
    "nts", "pij",
    "oun", "vaj",
    "pcr", "adx",
    "pmc", "huw",
    "pmu", "phr",
    "ppa", "bfy",
    "ppr", "lcq",
    "pry", "prt",
    "puz", "pub",
    "sca", "hle",
    "skk", "oyb",
    "tdu", "dtp",
    "thc", "tpo",
    "thx", "oyb",
    "tie", "ras",
    "tkk", "twm",
    "tlw", "weo",
    "tmp", "tyj",
    "tne", "kak",
    "tnf", "prs",
    "tsf", "taj",
    "uok", "ema",
    "xba", "cax",
    "xia", "acn",
    "xkh", "waw",
    "xsj", "suj",
    "ybd", "rki",
    "yma", "lrr",
    "ymt", "mtm",
    "yos", "zom",
    "yuu", "yug",
};
278
/*
 Updated on 2018-04-24 from

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
 grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
 grep -B1 'Preferred' | \
 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

 Layout: rows of char[3] (2 letters plus NUL); each even row holds a
 deprecated region code and the following odd row holds its replacement.
 _appendRegionToLanguageTag() walks the table two rows at a time.
*/
static const char DEPRECATEDREGIONS[][3] = {
    /* deprecated new */
    "BU", "MM",
    "DD", "DE",
    "FX", "FR",
    "TP", "TL",
    "YD", "YE",
    "ZR", "CD",
};
296
297 /*
298 * -------------------------------------------------
299 *
300 * These ultag_ functions may be exposed as APIs later
301 *
302 * -------------------------------------------------
303 */
304
/*
 * Forward declarations of the file-local ultag_ functions.
 * NOTE(review): their definitions are not in this part of the file; the
 * comments below are limited to what the signatures show.
 */

/* Returns a ULanguageTag for tag/tagLen; parsedLen and status are out-parameters. */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

/* Closes a ULanguageTag; also used as the closer of LocalULanguageTagPointer. */
static void
ultag_close(ULanguageTag* langtag);

static const char*
ultag_getLanguage(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

/* Indexed accessor; pairs with ultag_getExtlangSize(). */
static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

/* Indexed accessor; pairs with ultag_getVariantsSize(). */
static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

/* Indexed accessors; pair with ultag_getExtensionsSize(). */
static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getLegacy(const ULanguageTag* langtag);
#endif
353
U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
 *
 * The class is generated by the U_DEFINE_LOCAL_OPEN_POINTER macro below, so
 * ultag_close() must already be declared at this point.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @internal
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);

U_NAMESPACE_END
368
369 /*
370 * -------------------------------------------------
371 *
372 * Language subtag syntax validation functions
373 *
374 * -------------------------------------------------
375 */
376
377 static UBool
_isAlphaString(const char * s,int32_t len)378 _isAlphaString(const char* s, int32_t len) {
379 int32_t i;
380 for (i = 0; i < len; i++) {
381 if (!ISALPHA(*(s + i))) {
382 return FALSE;
383 }
384 }
385 return TRUE;
386 }
387
388 static UBool
_isNumericString(const char * s,int32_t len)389 _isNumericString(const char* s, int32_t len) {
390 int32_t i;
391 for (i = 0; i < len; i++) {
392 if (!ISNUMERIC(*(s + i))) {
393 return FALSE;
394 }
395 }
396 return TRUE;
397 }
398
399 static UBool
_isAlphaNumericString(const char * s,int32_t len)400 _isAlphaNumericString(const char* s, int32_t len) {
401 int32_t i;
402 for (i = 0; i < len; i++) {
403 if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
404 return FALSE;
405 }
406 }
407 return TRUE;
408 }
409
410 static UBool
_isAlphaNumericStringLimitedLength(const char * s,int32_t len,int32_t min,int32_t max)411 _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
412 if (len < 0) {
413 len = (int32_t)uprv_strlen(s);
414 }
415 if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
416 return TRUE;
417 }
418 return FALSE;
419 }
420
421 U_CFUNC UBool
ultag_isLanguageSubtag(const char * s,int32_t len)422 ultag_isLanguageSubtag(const char* s, int32_t len) {
423 /*
424 * unicode_language_subtag = alpha{2,3} | alpha{5,8};
425 * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
426 * See ICU-20372
427 */
428 if (len < 0) {
429 len = (int32_t)uprv_strlen(s);
430 }
431 if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
432 return TRUE;
433 }
434 return FALSE;
435 }
436
437 static UBool
_isExtlangSubtag(const char * s,int32_t len)438 _isExtlangSubtag(const char* s, int32_t len) {
439 /*
440 * extlang = 3ALPHA ; selected ISO 639 codes
441 * *2("-" 3ALPHA) ; permanently reserved
442 */
443 if (len < 0) {
444 len = (int32_t)uprv_strlen(s);
445 }
446 if (len == 3 && _isAlphaString(s, len)) {
447 return TRUE;
448 }
449 return FALSE;
450 }
451
452 U_CFUNC UBool
ultag_isScriptSubtag(const char * s,int32_t len)453 ultag_isScriptSubtag(const char* s, int32_t len) {
454 /*
455 * script = 4ALPHA ; ISO 15924 code
456 */
457 if (len < 0) {
458 len = (int32_t)uprv_strlen(s);
459 }
460 if (len == 4 && _isAlphaString(s, len)) {
461 return TRUE;
462 }
463 return FALSE;
464 }
465
466 U_CFUNC UBool
ultag_isRegionSubtag(const char * s,int32_t len)467 ultag_isRegionSubtag(const char* s, int32_t len) {
468 /*
469 * region = 2ALPHA ; ISO 3166-1 code
470 * / 3DIGIT ; UN M.49 code
471 */
472 if (len < 0) {
473 len = (int32_t)uprv_strlen(s);
474 }
475 if (len == 2 && _isAlphaString(s, len)) {
476 return TRUE;
477 }
478 if (len == 3 && _isNumericString(s, len)) {
479 return TRUE;
480 }
481 return FALSE;
482 }
483
484 static UBool
_isVariantSubtag(const char * s,int32_t len)485 _isVariantSubtag(const char* s, int32_t len) {
486 /*
487 * variant = 5*8alphanum ; registered variants
488 * / (DIGIT 3alphanum)
489 */
490 if (len < 0) {
491 len = (int32_t)uprv_strlen(s);
492 }
493 if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
494 return TRUE;
495 }
496 if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
497 return TRUE;
498 }
499 return FALSE;
500 }
501
502 static UBool
_isSepListOf(UBool (* test)(const char *,int32_t),const char * s,int32_t len)503 _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
504 const char *p = s;
505 const char *pSubtag = NULL;
506
507 if (len < 0) {
508 len = (int32_t)uprv_strlen(s);
509 }
510
511 while ((p - s) < len) {
512 if (*p == SEP) {
513 if (pSubtag == NULL) {
514 return FALSE;
515 }
516 if (!test(pSubtag, (int32_t)(p - pSubtag))) {
517 return FALSE;
518 }
519 pSubtag = NULL;
520 } else if (pSubtag == NULL) {
521 pSubtag = p;
522 }
523 p++;
524 }
525 if (pSubtag == NULL) {
526 return FALSE;
527 }
528 return test(pSubtag, (int32_t)(p - pSubtag));
529 }
530
531 U_CFUNC UBool
ultag_isVariantSubtags(const char * s,int32_t len)532 ultag_isVariantSubtags(const char* s, int32_t len) {
533 return _isSepListOf(&_isVariantSubtag, s, len);
534 }
535
536 // This is for the ICU-specific "lvariant" handling.
537 static UBool
_isPrivateuseVariantSubtag(const char * s,int32_t len)538 _isPrivateuseVariantSubtag(const char* s, int32_t len) {
539 /*
540 * variant = 1*8alphanum ; registered variants
541 * / (DIGIT 3alphanum)
542 */
543 return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
544 }
545
546 static UBool
_isExtensionSingleton(const char * s,int32_t len)547 _isExtensionSingleton(const char* s, int32_t len) {
548 /*
549 * extension = singleton 1*("-" (2*8alphanum))
550 *
551 * singleton = DIGIT ; 0 - 9
552 * / %x41-57 ; A - W
553 * / %x59-5A ; Y - Z
554 * / %x61-77 ; a - w
555 * / %x79-7A ; y - z
556 */
557 if (len < 0) {
558 len = (int32_t)uprv_strlen(s);
559 }
560 if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
561 return TRUE;
562 }
563 return FALSE;
564 }
565
566 static UBool
_isExtensionSubtag(const char * s,int32_t len)567 _isExtensionSubtag(const char* s, int32_t len) {
568 /*
569 * extension = singleton 1*("-" (2*8alphanum))
570 */
571 return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
572 }
573
574 U_CFUNC UBool
ultag_isExtensionSubtags(const char * s,int32_t len)575 ultag_isExtensionSubtags(const char* s, int32_t len) {
576 return _isSepListOf(&_isExtensionSubtag, s, len);
577 }
578
579 static UBool
_isPrivateuseValueSubtag(const char * s,int32_t len)580 _isPrivateuseValueSubtag(const char* s, int32_t len) {
581 /*
582 * privateuse = "x" 1*("-" (1*8alphanum))
583 */
584 return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
585 }
586
587 U_CFUNC UBool
ultag_isPrivateuseValueSubtags(const char * s,int32_t len)588 ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
589 return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
590 }
591
592 U_CFUNC UBool
ultag_isUnicodeLocaleAttribute(const char * s,int32_t len)593 ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
594 /*
595 * attribute = alphanum{3,8} ;
596 */
597 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
598 }
599
600 U_CFUNC UBool
ultag_isUnicodeLocaleAttributes(const char * s,int32_t len)601 ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
602 return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
603 }
604
605 U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char * s,int32_t len)606 ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
607 /*
608 * key = alphanum alpha ;
609 */
610 if (len < 0) {
611 len = (int32_t)uprv_strlen(s);
612 }
613 if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
614 return TRUE;
615 }
616 return FALSE;
617 }
618
619 U_CFUNC UBool
_isUnicodeLocaleTypeSubtag(const char * s,int32_t len)620 _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
621 /*
622 * alphanum{3,8}
623 */
624 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
625 }
626
627 U_CFUNC UBool
ultag_isUnicodeLocaleType(const char * s,int32_t len)628 ultag_isUnicodeLocaleType(const char*s, int32_t len) {
629 /*
630 * type = alphanum{3,8} (sep alphanum{3,8})* ;
631 */
632 return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
633 }
634
635 static UBool
_isTKey(const char * s,int32_t len)636 _isTKey(const char* s, int32_t len)
637 {
638 /*
639 * tkey = alpha digit ;
640 */
641 if (len < 0) {
642 len = (int32_t)uprv_strlen(s);
643 }
644 if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
645 return TRUE;
646 }
647 return FALSE;
648 }
649
650 static UBool
_isTValue(const char * s,int32_t len)651 _isTValue(const char* s, int32_t len)
652 {
653 /*
654 * tvalue = (sep alphanum{3,8})+ ;
655 */
656 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
657 }
658
659 static UBool
_isTransformedExtensionSubtag(int32_t & state,const char * s,int32_t len)660 _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
661 {
662 const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end
663 const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
664 // unicode_region_subtag, unicode_variant_subtag, tkey or end
665 const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
666 // unicode_variant_subtag, tkey, or end
667 const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
668 // tkey, or end.
669 const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
670 // tkey or end.
671 const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here.
672 const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end
673
674 switch (state) {
675 case kStart:
676 if (ultag_isLanguageSubtag(s, len)) {
677 state = kGotLanguage;
678 return TRUE;
679 }
680 if (_isTKey(s, len)) {
681 state = kGotTKey;
682 return TRUE;
683 }
684 return FALSE;
685 case kGotLanguage:
686 if (ultag_isScriptSubtag(s, len)) {
687 state = kGotScript;
688 return TRUE;
689 }
690 U_FALLTHROUGH;
691 case kGotScript:
692 if (ultag_isRegionSubtag(s, len)) {
693 state = kGotRegion;
694 return TRUE;
695 }
696 U_FALLTHROUGH;
697 case kGotRegion:
698 U_FALLTHROUGH;
699 case kGotVariant:
700 if (_isVariantSubtag(s, len)) {
701 state = kGotVariant;
702 return TRUE;
703 }
704 if (_isTKey(s, len)) {
705 state = kGotTKey;
706 return TRUE;
707 }
708 return FALSE;
709 case kGotTKey:
710 if (_isTValue(s, len)) {
711 state = kGotTValue;
712 return TRUE;
713 }
714 return FALSE;
715 case kGotTValue:
716 if (_isTKey(s, len)) {
717 state = kGotTKey;
718 return TRUE;
719 }
720 if (_isTValue(s, len)) {
721 return TRUE;
722 }
723 return FALSE;
724 }
725 return FALSE;
726 }
727
728 static UBool
_isUnicodeExtensionSubtag(int32_t & state,const char * s,int32_t len)729 _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
730 {
731 const int32_t kStart = 0; // Start, wait for a key or attribute or end
732 const int32_t kGotKey = 1; // Got a key, wait for type or key or end
733 const int32_t kGotType = 2; // Got a type, wait for key or end
734
735 switch (state) {
736 case kStart:
737 if (ultag_isUnicodeLocaleKey(s, len)) {
738 state = kGotKey;
739 return TRUE;
740 }
741 if (ultag_isUnicodeLocaleAttribute(s, len)) {
742 return TRUE;
743 }
744 return FALSE;
745 case kGotKey:
746 if (ultag_isUnicodeLocaleKey(s, len)) {
747 return TRUE;
748 }
749 if (_isUnicodeLocaleTypeSubtag(s, len)) {
750 state = kGotType;
751 return TRUE;
752 }
753 return FALSE;
754 case kGotType:
755 if (ultag_isUnicodeLocaleKey(s, len)) {
756 state = kGotKey;
757 return TRUE;
758 }
759 if (_isUnicodeLocaleTypeSubtag(s, len)) {
760 return TRUE;
761 }
762 return FALSE;
763 }
764 return FALSE;
765 }
766
767 static UBool
_isStatefulSepListOf(UBool (* test)(int32_t &,const char *,int32_t),const char * s,int32_t len)768 _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
769 {
770 int32_t state = 0;
771 const char* p;
772 const char* start = s;
773 int32_t subtagLen = 0;
774
775 if (len < 0) {
776 len = (int32_t)uprv_strlen(s);
777 }
778
779 for (p = s; len > 0; p++, len--) {
780 if (*p == SEP) {
781 if (!test(state, start, subtagLen)) {
782 return FALSE;
783 }
784 subtagLen = 0;
785 start = p + 1;
786 } else {
787 subtagLen++;
788 }
789 }
790
791 if (test(state, start, subtagLen) && state >= 0) {
792 return TRUE;
793 }
794 return FALSE;
795 }
796
797 U_CFUNC UBool
ultag_isTransformedExtensionSubtags(const char * s,int32_t len)798 ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
799 {
800 return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
801 }
802
803 U_CFUNC UBool
ultag_isUnicodeExtensionSubtags(const char * s,int32_t len)804 ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
805 return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
806 }
807
808
809 /*
810 * -------------------------------------------------
811 *
812 * Helper functions
813 *
814 * -------------------------------------------------
815 */
816
817 static UBool
_addVariantToList(VariantListEntry ** first,VariantListEntry * var)818 _addVariantToList(VariantListEntry **first, VariantListEntry *var) {
819 UBool bAdded = TRUE;
820
821 if (*first == NULL) {
822 var->next = NULL;
823 *first = var;
824 } else {
825 VariantListEntry *prev, *cur;
826 int32_t cmp;
827
828 /* variants order should be preserved */
829 prev = NULL;
830 cur = *first;
831 while (TRUE) {
832 if (cur == NULL) {
833 prev->next = var;
834 var->next = NULL;
835 break;
836 }
837
838 /* Checking for duplicate variant */
839 cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
840 if (cmp == 0) {
841 /* duplicated variant */
842 bAdded = FALSE;
843 break;
844 }
845 prev = cur;
846 cur = cur->next;
847 }
848 }
849
850 return bAdded;
851 }
852
853 static UBool
_addAttributeToList(AttributeListEntry ** first,AttributeListEntry * attr)854 _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
855 UBool bAdded = TRUE;
856
857 if (*first == NULL) {
858 attr->next = NULL;
859 *first = attr;
860 } else {
861 AttributeListEntry *prev, *cur;
862 int32_t cmp;
863
864 /* reorder variants in alphabetical order */
865 prev = NULL;
866 cur = *first;
867 while (TRUE) {
868 if (cur == NULL) {
869 prev->next = attr;
870 attr->next = NULL;
871 break;
872 }
873 cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
874 if (cmp < 0) {
875 if (prev == NULL) {
876 *first = attr;
877 } else {
878 prev->next = attr;
879 }
880 attr->next = cur;
881 break;
882 }
883 if (cmp == 0) {
884 /* duplicated variant */
885 bAdded = FALSE;
886 break;
887 }
888 prev = cur;
889 cur = cur->next;
890 }
891 }
892
893 return bAdded;
894 }
895
896
897 static UBool
_addExtensionToList(ExtensionListEntry ** first,ExtensionListEntry * ext,UBool localeToBCP)898 _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
899 UBool bAdded = TRUE;
900
901 if (*first == NULL) {
902 ext->next = NULL;
903 *first = ext;
904 } else {
905 ExtensionListEntry *prev, *cur;
906 int32_t cmp;
907
908 /* reorder variants in alphabetical order */
909 prev = NULL;
910 cur = *first;
911 while (TRUE) {
912 if (cur == NULL) {
913 prev->next = ext;
914 ext->next = NULL;
915 break;
916 }
917 if (localeToBCP) {
918 /* special handling for locale to bcp conversion */
919 int32_t len, curlen;
920
921 len = (int32_t)uprv_strlen(ext->key);
922 curlen = (int32_t)uprv_strlen(cur->key);
923
924 if (len == 1 && curlen == 1) {
925 if (*(ext->key) == *(cur->key)) {
926 cmp = 0;
927 } else if (*(ext->key) == PRIVATEUSE) {
928 cmp = 1;
929 } else if (*(cur->key) == PRIVATEUSE) {
930 cmp = -1;
931 } else {
932 cmp = *(ext->key) - *(cur->key);
933 }
934 } else if (len == 1) {
935 cmp = *(ext->key) - LDMLEXT;
936 } else if (curlen == 1) {
937 cmp = LDMLEXT - *(cur->key);
938 } else {
939 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
940 /* Both are u extension keys - we need special handling for 'attribute' */
941 if (cmp != 0) {
942 if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
943 cmp = 1;
944 } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
945 cmp = -1;
946 }
947 }
948 }
949 } else {
950 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
951 }
952 if (cmp < 0) {
953 if (prev == NULL) {
954 *first = ext;
955 } else {
956 prev->next = ext;
957 }
958 ext->next = cur;
959 break;
960 }
961 if (cmp == 0) {
962 /* duplicated extension key */
963 bAdded = FALSE;
964 break;
965 }
966 prev = cur;
967 cur = cur->next;
968 }
969 }
970
971 return bAdded;
972 }
973
974 static void
_initializeULanguageTag(ULanguageTag * langtag)975 _initializeULanguageTag(ULanguageTag* langtag) {
976 int32_t i;
977
978 langtag->buf = NULL;
979
980 langtag->language = EMPTY;
981 for (i = 0; i < MAXEXTLANG; i++) {
982 langtag->extlang[i] = NULL;
983 }
984
985 langtag->script = EMPTY;
986 langtag->region = EMPTY;
987
988 langtag->variants = NULL;
989 langtag->extensions = NULL;
990
991 langtag->legacy = EMPTY;
992 langtag->privateuse = EMPTY;
993 }
994
995 static void
_appendLanguageToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)996 _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
997 char buf[ULOC_LANG_CAPACITY];
998 UErrorCode tmpStatus = U_ZERO_ERROR;
999 int32_t len, i;
1000
1001 if (U_FAILURE(*status)) {
1002 return;
1003 }
1004
1005 len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1006 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1007 if (strict) {
1008 *status = U_ILLEGAL_ARGUMENT_ERROR;
1009 return;
1010 }
1011 len = 0;
1012 }
1013
1014 /* Note: returned language code is in lower case letters */
1015
1016 if (len == 0) {
1017 sink.Append(LANG_UND, LANG_UND_LEN);
1018 } else if (!ultag_isLanguageSubtag(buf, len)) {
1019 /* invalid language code */
1020 if (strict) {
1021 *status = U_ILLEGAL_ARGUMENT_ERROR;
1022 return;
1023 }
1024 sink.Append(LANG_UND, LANG_UND_LEN);
1025 } else {
1026 /* resolve deprecated */
1027 for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1028 // 2-letter deprecated subtags are listede before 3-letter
1029 // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1030 // across the 1st 3-letter subtag, if the input is a 2-letter code.
1031 // to avoid continuing to try when there's no match.
1032 if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1033 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1034 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1035 len = (int32_t)uprv_strlen(buf);
1036 break;
1037 }
1038 }
1039 sink.Append(buf, len);
1040 }
1041 }
1042
1043 static void
_appendScriptToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1044 _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1045 char buf[ULOC_SCRIPT_CAPACITY];
1046 UErrorCode tmpStatus = U_ZERO_ERROR;
1047 int32_t len;
1048
1049 if (U_FAILURE(*status)) {
1050 return;
1051 }
1052
1053 len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1054 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1055 if (strict) {
1056 *status = U_ILLEGAL_ARGUMENT_ERROR;
1057 }
1058 return;
1059 }
1060
1061 if (len > 0) {
1062 if (!ultag_isScriptSubtag(buf, len)) {
1063 /* invalid script code */
1064 if (strict) {
1065 *status = U_ILLEGAL_ARGUMENT_ERROR;
1066 }
1067 return;
1068 } else {
1069 sink.Append("-", 1);
1070 sink.Append(buf, len);
1071 }
1072 }
1073 }
1074
1075 static void
_appendRegionToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1076 _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1077 char buf[ULOC_COUNTRY_CAPACITY];
1078 UErrorCode tmpStatus = U_ZERO_ERROR;
1079 int32_t len;
1080
1081 if (U_FAILURE(*status)) {
1082 return;
1083 }
1084
1085 len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1086 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1087 if (strict) {
1088 *status = U_ILLEGAL_ARGUMENT_ERROR;
1089 }
1090 return;
1091 }
1092
1093 if (len > 0) {
1094 if (!ultag_isRegionSubtag(buf, len)) {
1095 /* invalid region code */
1096 if (strict) {
1097 *status = U_ILLEGAL_ARGUMENT_ERROR;
1098 }
1099 return;
1100 } else {
1101 sink.Append("-", 1);
1102 /* resolve deprecated */
1103 for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1104 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1105 uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1106 len = (int32_t)uprv_strlen(buf);
1107 break;
1108 }
1109 }
1110 sink.Append(buf, len);
1111 }
1112 }
1113 }
1114
_sortVariants(VariantListEntry * first)1115 static void _sortVariants(VariantListEntry* first) {
1116 for (VariantListEntry* var1 = first; var1 != NULL; var1 = var1->next) {
1117 for (VariantListEntry* var2 = var1->next; var2 != NULL; var2 = var2->next) {
1118 // Swap var1->variant and var2->variant.
1119 if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) {
1120 const char* temp = var1->variant;
1121 var1->variant = var2->variant;
1122 var2->variant = temp;
1123 }
1124 }
1125 }
1126 }
1127
/*
 * Appends the variant subtags of localeID to `sink`, each preceded by "-".
 *
 * The variant part comes from uloc_getVariant(); it is split at SEP and
 * LOCALE_SEP, lowercased in place, validated with _isVariantSubtag(),
 * de-duplicated and sorted before it is written.
 *
 * localeID  locale ID to read the variants from
 * sink      receives the output; only written when no error was set here
 * strict    when set, empty, invalid or duplicated variants (and a variant
 *           part that does not fit the local buffer) set *status to
 *           U_ILLEGAL_ARGUMENT_ERROR; otherwise they are skipped
 * hadPosix  set to TRUE when the whole variant part is "posix" (any case);
 *           that variant is not written here. Never set to FALSE here.
 * status    in/out error code; nothing is done when it already indicates a
 *           failure. U_MEMORY_ALLOCATION_ERROR is set when a list node
 *           cannot be allocated.
 */
static void
_appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
    char buf[ULOC_FULLNAME_CAPACITY];
    UErrorCode tmpStatus = U_ZERO_ERROR;
    int32_t len, i;

    if (U_FAILURE(*status)) {
        return;
    }

    len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
    if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }

    if (len > 0) {
        char *p, *pVar;
        UBool bNext = TRUE;
        VariantListEntry *var;
        VariantListEntry *varFirst = NULL;

        /* pVar marks the start of the variant being scanned, NULL between variants */
        pVar = NULL;
        p = buf;
        while (bNext) {
            if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
                if (*p == 0) {
                    /* end of the buffer: this is the last round */
                    bNext = FALSE;
                } else {
                    *p = 0; /* terminate */
                }
                if (pVar == NULL) {
                    if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    }
                    /* ignore empty variant */
                } else {
                    /* ICU uses upper case letters for variants, but
                       the canonical format is lowercase in BCP47 */
                    for (i = 0; *(pVar + i) != 0; i++) {
                        *(pVar + i) = uprv_tolower(*(pVar + i));
                    }

                    /* validate */
                    if (_isVariantSubtag(pVar, -1)) {
                        /* true unless "posix" is the only variant (len is the length of the whole variant part) */
                        if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
                            /* emit the variant to the list */
                            var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
                            if (var == NULL) {
                                *status = U_MEMORY_ALLOCATION_ERROR;
                                break;
                            }
                            /* the node points into buf; the text is not copied */
                            var->variant = pVar;
                            if (!_addVariantToList(&varFirst, var)) {
                                /* duplicated variant */
                                uprv_free(var);
                                if (strict) {
                                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                                    break;
                                }
                            }
                        } else {
                            /* Special handling for POSIX variant, need to remember that we had it and then */
                            /* treat it like an extension later. */
                            *hadPosix = TRUE;
                        }
                    } else if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    } else if (_isPrivateuseValueSubtag(pVar, -1)) {
                        /* Handle private use subtags separately */
                        /* this also stops the scan: later variants are not processed here */
                        break;
                    }
                    /* any other invalid variant is dropped in lenient mode */
                }
                /* reset variant starting position */
                pVar = NULL;
            } else if (pVar == NULL) {
                pVar = p;
            }
            p++;
        }

        if (U_SUCCESS(*status)) {
            if (varFirst != NULL) {
                int32_t varLen;

                /* per UTS35, we should sort the variants */
                _sortVariants(varFirst);

                /* write out validated/normalized variants to the target */
                var = varFirst;
                while (var != NULL) {
                    sink.Append("-", 1);
                    varLen = (int32_t)uprv_strlen(var->variant);
                    sink.Append(var->variant, varLen);
                    var = var->next;
                }
            }
        }

        /* clean up */
        /* the nodes are released on both the success and the error path */
        var = varFirst;
        while (var != NULL) {
            VariantListEntry *tmpVar = var->next;
            uprv_free(var);
            var = tmpVar;
        }

        if (U_FAILURE(*status)) {
            return;
        }
    }
}
1244
1245 static void
_appendKeywordsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1246 _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1247 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1248 int32_t attrBufLength = 0;
1249
1250 icu::MemoryPool<AttributeListEntry> attrPool;
1251 icu::MemoryPool<ExtensionListEntry> extPool;
1252 icu::MemoryPool<icu::CharString> strPool;
1253
1254 icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1255 if (U_FAILURE(*status) && !hadPosix) {
1256 return;
1257 }
1258 if (keywordEnum.isValid() || hadPosix) {
1259 /* reorder extensions */
1260 int32_t len;
1261 const char *key;
1262 ExtensionListEntry *firstExt = NULL;
1263 ExtensionListEntry *ext;
1264 AttributeListEntry *firstAttr = NULL;
1265 AttributeListEntry *attr;
1266 icu::MemoryPool<icu::CharString> extBufPool;
1267 const char *bcpKey=nullptr, *bcpValue=nullptr;
1268 UErrorCode tmpStatus = U_ZERO_ERROR;
1269 int32_t keylen;
1270 UBool isBcpUExt;
1271
1272 while (TRUE) {
1273 key = uenum_next(keywordEnum.getAlias(), NULL, status);
1274 if (key == NULL) {
1275 break;
1276 }
1277
1278 icu::CharString buf;
1279 {
1280 icu::CharStringByteSink sink(&buf);
1281 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
1282 }
1283 len = buf.length();
1284
1285 if (U_FAILURE(tmpStatus)) {
1286 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1287 *status = U_MEMORY_ALLOCATION_ERROR;
1288 break;
1289 }
1290 if (strict) {
1291 *status = U_ILLEGAL_ARGUMENT_ERROR;
1292 break;
1293 }
1294 /* ignore this keyword */
1295 tmpStatus = U_ZERO_ERROR;
1296 continue;
1297 }
1298
1299 keylen = (int32_t)uprv_strlen(key);
1300 isBcpUExt = (keylen > 1);
1301
1302 /* special keyword used for representing Unicode locale attributes */
1303 if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1304 if (len > 0) {
1305 int32_t i = 0;
1306 while (TRUE) {
1307 attrBufLength = 0;
1308 for (; i < len; i++) {
1309 if (buf[i] != '-') {
1310 attrBuf[attrBufLength++] = buf[i];
1311 } else {
1312 i++;
1313 break;
1314 }
1315 }
1316 if (attrBufLength > 0) {
1317 attrBuf[attrBufLength] = 0;
1318
1319 } else if (i >= len){
1320 break;
1321 }
1322
1323 /* create AttributeListEntry */
1324 attr = attrPool.create();
1325 if (attr == NULL) {
1326 *status = U_MEMORY_ALLOCATION_ERROR;
1327 break;
1328 }
1329 icu::CharString* attrValue =
1330 strPool.create(attrBuf, attrBufLength, *status);
1331 if (attrValue == NULL) {
1332 *status = U_MEMORY_ALLOCATION_ERROR;
1333 break;
1334 }
1335 if (U_FAILURE(*status)) {
1336 break;
1337 }
1338 attr->attribute = attrValue->data();
1339
1340 if (!_addAttributeToList(&firstAttr, attr)) {
1341 if (strict) {
1342 *status = U_ILLEGAL_ARGUMENT_ERROR;
1343 break;
1344 }
1345 }
1346 }
1347 /* for a place holder ExtensionListEntry */
1348 bcpKey = LOCALE_ATTRIBUTE_KEY;
1349 bcpValue = NULL;
1350 }
1351 } else if (isBcpUExt) {
1352 bcpKey = uloc_toUnicodeLocaleKey(key);
1353 if (bcpKey == NULL) {
1354 if (strict) {
1355 *status = U_ILLEGAL_ARGUMENT_ERROR;
1356 break;
1357 }
1358 continue;
1359 }
1360
1361 /* we've checked buf is null-terminated above */
1362 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1363 if (bcpValue == NULL) {
1364 if (strict) {
1365 *status = U_ILLEGAL_ARGUMENT_ERROR;
1366 break;
1367 }
1368 continue;
1369 }
1370 if (bcpValue == buf.data()) {
1371 /*
1372 When uloc_toUnicodeLocaleType(key, buf) returns the
1373 input value as is, the value is well-formed, but has
1374 no known mapping. This implementation normalizes the
1375 value to lower case
1376 */
1377 icu::CharString* extBuf = extBufPool.create(buf, tmpStatus);
1378
1379 if (extBuf == nullptr) {
1380 *status = U_MEMORY_ALLOCATION_ERROR;
1381 break;
1382 }
1383 if (U_FAILURE(tmpStatus)) {
1384 *status = tmpStatus;
1385 break;
1386 }
1387
1388 T_CString_toLowerCase(extBuf->data());
1389 bcpValue = extBuf->data();
1390 }
1391 } else {
1392 if (*key == PRIVATEUSE) {
1393 if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1394 if (strict) {
1395 *status = U_ILLEGAL_ARGUMENT_ERROR;
1396 break;
1397 }
1398 continue;
1399 }
1400 } else {
1401 if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1402 if (strict) {
1403 *status = U_ILLEGAL_ARGUMENT_ERROR;
1404 break;
1405 }
1406 continue;
1407 }
1408 }
1409 bcpKey = key;
1410 icu::CharString* extBuf =
1411 extBufPool.create(buf.data(), len, tmpStatus);
1412 if (extBuf == nullptr) {
1413 *status = U_MEMORY_ALLOCATION_ERROR;
1414 break;
1415 }
1416 if (U_FAILURE(tmpStatus)) {
1417 *status = tmpStatus;
1418 break;
1419 }
1420 bcpValue = extBuf->data();
1421 }
1422
1423 /* create ExtensionListEntry */
1424 ext = extPool.create();
1425 if (ext == NULL) {
1426 *status = U_MEMORY_ALLOCATION_ERROR;
1427 break;
1428 }
1429 ext->key = bcpKey;
1430 ext->value = bcpValue;
1431
1432 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1433 if (strict) {
1434 *status = U_ILLEGAL_ARGUMENT_ERROR;
1435 break;
1436 }
1437 }
1438 }
1439
1440 /* Special handling for POSIX variant - add the keywords for POSIX */
1441 if (hadPosix) {
1442 /* create ExtensionListEntry for POSIX */
1443 ext = extPool.create();
1444 if (ext == NULL) {
1445 *status = U_MEMORY_ALLOCATION_ERROR;
1446 return;
1447 }
1448 ext->key = POSIX_KEY;
1449 ext->value = POSIX_VALUE;
1450
1451 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1452 // Silently ignore errors.
1453 }
1454 }
1455
1456 if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
1457 UBool startLDMLExtension = FALSE;
1458 for (ext = firstExt; ext; ext = ext->next) {
1459 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1460 /* first LDML u singlton extension */
1461 sink.Append("-u", 2);
1462 startLDMLExtension = TRUE;
1463 }
1464
1465 /* write out the sorted BCP47 attributes, extensions and private use */
1466 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1467 /* write the value for the attributes */
1468 for (attr = firstAttr; attr; attr = attr->next) {
1469 sink.Append("-", 1);
1470 sink.Append(
1471 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1472 }
1473 } else {
1474 sink.Append("-", 1);
1475 sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1476 if (uprv_strcmp(ext->value, "true") != 0 &&
1477 uprv_strcmp(ext->value, "yes") != 0) {
1478 sink.Append("-", 1);
1479 sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1480 }
1481 }
1482 }
1483 }
1484 }
1485 }
1486
/**
 * Converts the subtags of a BCP 47 "u" extension into LDML keywords and adds
 * them to an extension list.
 * e.g. "ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
 *
 * Leading subtags that are not Unicode locale keys are treated as attributes
 * and are emitted as one LOCALE_ATTRIBUTE_KEY keyword whose value joins them
 * with '-'. The remaining subtags are read as key / type pairs.
 *
 * @param ldmlext      the subtags to convert, separated by SEP.
 *                     NOTE(review): the only caller visible here passes the
 *                     value of the 'u' extension, presumably without the
 *                     leading "u-" - confirm.
 * @param appendTo     list that receives the resulting keywords.
 * @param extPool      pool that owns the ExtensionListEntry objects created here.
 * @param kwdBuf       pool that owns strings created here (joined attribute
 *                     value, lower-cased keys and types).
 * @param posixVariant in: TRUE when a variant already exists, in which case
 *                     "va-posix" is kept as a regular keyword.
 *                     out: TRUE when "va-posix" was found and should be
 *                     treated as a variant by the caller; FALSE otherwise.
 * @param status       set to U_MEMORY_ALLOCATION_ERROR or
 *                     U_ILLEGAL_ARGUMENT_ERROR on failure; in that case the
 *                     function returns before anything is added to appendTo.
 */
static void
_appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
    const char *pTag;   /* beginning of current subtag */
    const char *pKwds;  /* beginning of key-type pairs */
    UBool variantExists = *posixVariant;

    ExtensionListEntry *kwdFirst = NULL;    /* first LDML keyword */
    ExtensionListEntry *kwd, *nextKwd;

    int32_t len;

    /* Reset the posixVariant value; from here on it is an output */
    *posixVariant = FALSE;

    pTag = ldmlext;
    pKwds = NULL;

    {
        AttributeListEntry *attrFirst = NULL;   /* first attribute */
        AttributeListEntry *attr, *nextAttr;

        /* storage for NUL-terminated copies of the attribute subtags */
        char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
        int32_t attrBufIdx = 0;

        icu::MemoryPool<AttributeListEntry> attrPool;

        /* Iterate through u extension attributes */
        while (*pTag) {
            /* locate next separator char */
            for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);

            /* the first key ends the attribute section */
            if (ultag_isUnicodeLocaleKey(pTag, len)) {
                pKwds = pTag;
                break;
            }

            /* add this attribute to the list */
            attr = attrPool.create();
            if (attr == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }

            /* copy the subtag only if it, plus its terminator, still fits */
            if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
                uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
                attrBuf[attrBufIdx + len] = 0;
                attr->attribute = &attrBuf[attrBufIdx];
                attrBufIdx += (len + 1);
            } else {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }

            // duplicate attribute is ignored, causes no error.
            _addAttributeToList(&attrFirst, attr);

            /* next tag */
            pTag += len;
            if (*pTag) {
                /* next to the separator */
                pTag++;
            }
        }

        if (attrFirst) {
            /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */

            kwd = extPool.create();
            if (kwd == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }

            icu::CharString* value = kwdBuf.create();
            if (value == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }

            /* join the attribute subtags, in list order, with '-' */
            attr = attrFirst;
            while (attr != NULL) {
                nextAttr = attr->next;
                if (attr != attrFirst) {
                    value->append('-', *status);
                }
                value->append(attr->attribute, *status);
                attr = nextAttr;
            }
            if (U_FAILURE(*status)) {
                return;
            }

            kwd->key = LOCALE_ATTRIBUTE_KEY;
            kwd->value = value->data();

            if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
        }
    }

    if (pKwds) {
        const char *pBcpKey = NULL;     /* u extension key subtag */
        const char *pBcpType = NULL;    /* beginning of u extension type subtag(s) */
        int32_t bcpKeyLen = 0;
        int32_t bcpTypeLen = 0;
        UBool isDone = FALSE;

        pTag = pKwds;
        /* BCP47 representation of LDML key/type pairs */
        while (!isDone) {
            const char *pNextBcpKey = NULL;
            int32_t nextBcpKeyLen = 0;
            UBool emitKeyword = FALSE;

            if (*pTag) {
                /* locate next separator char */
                for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);

                if (ultag_isUnicodeLocaleKey(pTag, len)) {
                    if (pBcpKey) {
                        /* a new key completes the pending key/type pair */
                        emitKeyword = TRUE;
                        pNextBcpKey = pTag;
                        nextBcpKeyLen = len;
                    } else {
                        pBcpKey = pTag;
                        bcpKeyLen = len;
                    }
                } else {
                    U_ASSERT(pBcpKey != NULL);
                    /* within LDML type subtags */
                    if (pBcpType) {
                        /* extend the type over the separator and this subtag */
                        bcpTypeLen += (len + 1);
                    } else {
                        pBcpType = pTag;
                        bcpTypeLen = len;
                    }
                }

                /* next tag */
                pTag += len;
                if (*pTag) {
                    /* next to the separator */
                    pTag++;
                }
            } else {
                /* processing last one */
                emitKeyword = TRUE;
                isDone = TRUE;
            }

            if (emitKeyword) {
                const char *pKey = NULL;    /* LDML key */
                const char *pType = NULL;   /* LDML type */

                char bcpKeyBuf[3];          /* BCP key length is always 2 for now */

                U_ASSERT(pBcpKey != NULL);

                if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
                    /* the BCP key is invalid */
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                U_ASSERT(bcpKeyLen <= 2);

                uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
                bcpKeyBuf[bcpKeyLen] = 0;

                /* u extension key to LDML key */
                pKey = uloc_toLegacyKey(bcpKeyBuf);
                if (pKey == NULL) {
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                if (pKey == bcpKeyBuf) {
                    /*
                    The key returned by toLegacyKey points to the input buffer.
                    We normalize the result key to lower case and copy it into
                    kwdBuf, because bcpKeyBuf is a local array.
                    */
                    T_CString_toLowerCase(bcpKeyBuf);
                    icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
                    if (key == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return;
                    }
                    if (U_FAILURE(*status)) {
                        return;
                    }
                    pKey = key->data();
                }

                if (pBcpType) {
                    char bcpTypeBuf[128];   /* practically long enough even considering multiple subtag type */
                    if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
                        /* the BCP type is too long */
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        return;
                    }

                    uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
                    bcpTypeBuf[bcpTypeLen] = 0;

                    /* BCP type to locale type */
                    pType = uloc_toLegacyType(pKey, bcpTypeBuf);
                    if (pType == NULL) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        return;
                    }
                    if (pType == bcpTypeBuf) {
                        /*
                        The type returned by toLegacyType points to the input buffer.
                        We normalize the result type to lower case and copy it into
                        kwdBuf, because bcpTypeBuf is a local array.
                        */
                        T_CString_toLowerCase(bcpTypeBuf);
                        icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
                        if (type == NULL) {
                            *status = U_MEMORY_ALLOCATION_ERROR;
                            return;
                        }
                        if (U_FAILURE(*status)) {
                            return;
                        }
                        pType = type->data();
                    }
                } else {
                    /* typeless - default type value is "yes" */
                    pType = LOCALE_TYPE_YES;
                }

                /* Special handling for u-va-posix, since we want to treat this as a variant,
                   not as a keyword */
                if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
                    *posixVariant = TRUE;
                } else {
                    /* create an ExtensionListEntry for this keyword */
                    kwd = extPool.create();
                    if (kwd == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return;
                    }

                    kwd->key = pKey;
                    kwd->value = pType;

                    if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
                        // duplicate keyword is allowed, Only the first
                        // is honored.
                    }
                }

                /* start over with the key that triggered this emit, if any */
                pBcpKey = pNextBcpKey;
                bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
                pBcpType = NULL;
                bcpTypeLen = 0;
            }
        }
    }

    /* move the collected keywords into the caller's list; `next` is saved
       first because the entry is re-linked by _addExtensionToList */
    kwd = kwdFirst;
    while (kwd != NULL) {
        nextKwd = kwd->next;
        _addExtensionToList(appendTo, kwd, FALSE);
        kwd = nextKwd;
    }
}
1761
1762
1763 static void
_appendKeywords(ULanguageTag * langtag,icu::ByteSink & sink,UErrorCode * status)1764 _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1765 int32_t i, n;
1766 int32_t len;
1767 ExtensionListEntry *kwdFirst = NULL;
1768 ExtensionListEntry *kwd;
1769 const char *key, *type;
1770 icu::MemoryPool<ExtensionListEntry> extPool;
1771 icu::MemoryPool<icu::CharString> kwdBuf;
1772 UBool posixVariant = FALSE;
1773
1774 if (U_FAILURE(*status)) {
1775 return;
1776 }
1777
1778 /* Determine if variants already exists */
1779 if (ultag_getVariantsSize(langtag)) {
1780 posixVariant = TRUE;
1781 }
1782
1783 n = ultag_getExtensionsSize(langtag);
1784
1785 /* resolve locale keywords and reordering keys */
1786 for (i = 0; i < n; i++) {
1787 key = ultag_getExtensionKey(langtag, i);
1788 type = ultag_getExtensionValue(langtag, i);
1789 if (*key == LDMLEXT) {
1790 _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1791 if (U_FAILURE(*status)) {
1792 break;
1793 }
1794 } else {
1795 kwd = extPool.create();
1796 if (kwd == NULL) {
1797 *status = U_MEMORY_ALLOCATION_ERROR;
1798 break;
1799 }
1800 kwd->key = key;
1801 kwd->value = type;
1802 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1803 *status = U_ILLEGAL_ARGUMENT_ERROR;
1804 break;
1805 }
1806 }
1807 }
1808
1809 if (U_SUCCESS(*status)) {
1810 type = ultag_getPrivateUse(langtag);
1811 if ((int32_t)uprv_strlen(type) > 0) {
1812 /* add private use as a keyword */
1813 kwd = extPool.create();
1814 if (kwd == NULL) {
1815 *status = U_MEMORY_ALLOCATION_ERROR;
1816 } else {
1817 kwd->key = PRIVATEUSE_KEY;
1818 kwd->value = type;
1819 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1820 *status = U_ILLEGAL_ARGUMENT_ERROR;
1821 }
1822 }
1823 }
1824 }
1825
1826 /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1827
1828 if (U_SUCCESS(*status) && posixVariant) {
1829 len = (int32_t) uprv_strlen(_POSIX);
1830 sink.Append(_POSIX, len);
1831 }
1832
1833 if (U_SUCCESS(*status) && kwdFirst != NULL) {
1834 /* write out the sorted keywords */
1835 UBool firstValue = TRUE;
1836 kwd = kwdFirst;
1837 do {
1838 if (firstValue) {
1839 sink.Append("@", 1);
1840 firstValue = FALSE;
1841 } else {
1842 sink.Append(";", 1);
1843 }
1844
1845 /* key */
1846 len = (int32_t)uprv_strlen(kwd->key);
1847 sink.Append(kwd->key, len);
1848 sink.Append("=", 1);
1849
1850 /* type */
1851 len = (int32_t)uprv_strlen(kwd->value);
1852 sink.Append(kwd->value, len);
1853
1854 kwd = kwd->next;
1855 } while (kwd);
1856 }
1857 }
1858
1859 static void
_appendPrivateuseToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1860 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1861 (void)hadPosix;
1862 char buf[ULOC_FULLNAME_CAPACITY];
1863 char tmpAppend[ULOC_FULLNAME_CAPACITY];
1864 UErrorCode tmpStatus = U_ZERO_ERROR;
1865 int32_t len, i;
1866 int32_t reslen = 0;
1867 int32_t capacity = sizeof tmpAppend;
1868
1869 if (U_FAILURE(*status)) {
1870 return;
1871 }
1872
1873 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1874 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1875 if (strict) {
1876 *status = U_ILLEGAL_ARGUMENT_ERROR;
1877 }
1878 return;
1879 }
1880
1881 if (len > 0) {
1882 char *p, *pPriv;
1883 UBool bNext = TRUE;
1884 UBool firstValue = TRUE;
1885 UBool writeValue;
1886
1887 pPriv = NULL;
1888 p = buf;
1889 while (bNext) {
1890 writeValue = FALSE;
1891 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1892 if (*p == 0) {
1893 bNext = FALSE;
1894 } else {
1895 *p = 0; /* terminate */
1896 }
1897 if (pPriv != NULL) {
1898 /* Private use in the canonical format is lowercase in BCP47 */
1899 for (i = 0; *(pPriv + i) != 0; i++) {
1900 *(pPriv + i) = uprv_tolower(*(pPriv + i));
1901 }
1902
1903 /* validate */
1904 if (_isPrivateuseValueSubtag(pPriv, -1)) {
1905 if (firstValue) {
1906 if (!_isVariantSubtag(pPriv, -1)) {
1907 writeValue = TRUE;
1908 }
1909 } else {
1910 writeValue = TRUE;
1911 }
1912 } else if (strict) {
1913 *status = U_ILLEGAL_ARGUMENT_ERROR;
1914 break;
1915 } else {
1916 break;
1917 }
1918
1919 if (writeValue) {
1920 if (reslen < capacity) {
1921 tmpAppend[reslen++] = SEP;
1922 }
1923
1924 if (firstValue) {
1925 if (reslen < capacity) {
1926 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1927 }
1928
1929 if (reslen < capacity) {
1930 tmpAppend[reslen++] = SEP;
1931 }
1932
1933 len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1934 if (reslen < capacity) {
1935 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1936 }
1937 reslen += len;
1938
1939 if (reslen < capacity) {
1940 tmpAppend[reslen++] = SEP;
1941 }
1942
1943 firstValue = FALSE;
1944 }
1945
1946 len = (int32_t)uprv_strlen(pPriv);
1947 if (reslen < capacity) {
1948 uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1949 }
1950 reslen += len;
1951 }
1952 }
1953 /* reset private use starting position */
1954 pPriv = NULL;
1955 } else if (pPriv == NULL) {
1956 pPriv = p;
1957 }
1958 p++;
1959 }
1960
1961 if (U_FAILURE(*status)) {
1962 return;
1963 }
1964 }
1965
1966 if (U_SUCCESS(*status)) {
1967 len = reslen;
1968 sink.Append(tmpAppend, len);
1969 }
1970 }
1971
1972 /*
1973 * -------------------------------------------------
1974 *
1975 * ultag_ functions
1976 *
1977 * -------------------------------------------------
1978 */
1979
/*
 * Bit flags used by the parser.
 *
 * ultag_parse() keeps a mask of these flags that says which kinds of subtag
 * are acceptable at the current position:
 *   LANG - language subtag
 *   EXTL - extended language subtag
 *   SCRT - script subtag
 *   REGN - region subtag
 *   VART - variant subtag
 *   EXTS - extension singleton
 *   EXTV - extension value subtag
 *   PRIV - private use singleton
 */
#define LANG 0x0001
#define EXTL 0x0002
#define SCRT 0x0004
#define REGN 0x0008
#define VART 0x0010
#define EXTS 0x0020
#define EXTV 0x0040
#define PRIV 0x0080
1990 /**
1991 * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function.
1992 * As a work-around, optimization is disabled for this function on VS2015 and VS2017.
1993 * This work-around should be removed once the following versions of Visual Studio are no
1994 * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4.
1995 */
1996 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
1997 #pragma optimize( "", off )
1998 #endif
1999
2000 static ULanguageTag*
ultag_parse(const char * tag,int32_t tagLen,int32_t * parsedLen,UErrorCode * status)2001 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2002 char *tagBuf;
2003 int16_t next;
2004 char *pSubtag, *pNext, *pLastGoodPosition;
2005 int32_t subtagLen;
2006 int32_t extlangIdx;
2007 ExtensionListEntry *pExtension;
2008 char *pExtValueSubtag, *pExtValueSubtagEnd;
2009 int32_t i;
2010 UBool privateuseVar = FALSE;
2011 int32_t legacyLen = 0;
2012
2013 if (parsedLen != NULL) {
2014 *parsedLen = 0;
2015 }
2016
2017 if (U_FAILURE(*status)) {
2018 return NULL;
2019 }
2020
2021 if (tagLen < 0) {
2022 tagLen = (int32_t)uprv_strlen(tag);
2023 }
2024
2025 /* copy the entire string */
2026 tagBuf = (char*)uprv_malloc(tagLen + 1);
2027 if (tagBuf == NULL) {
2028 *status = U_MEMORY_ALLOCATION_ERROR;
2029 return NULL;
2030 }
2031 uprv_memcpy(tagBuf, tag, tagLen);
2032 *(tagBuf + tagLen) = 0;
2033
2034 /* create a ULanguageTag */
2035 icu::LocalULanguageTagPointer t(
2036 (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2037 if (t.isNull()) {
2038 uprv_free(tagBuf);
2039 *status = U_MEMORY_ALLOCATION_ERROR;
2040 return NULL;
2041 }
2042 _initializeULanguageTag(t.getAlias());
2043 t->buf = tagBuf;
2044
2045 if (tagLen < MINLEN) {
2046 /* the input tag is too short - return empty ULanguageTag */
2047 return t.orphan();
2048 }
2049
2050 size_t parsedLenDelta = 0;
2051 // Legacy tag will be consider together. Legacy tag with intervening
2052 // script and region such as art-DE-lojban or art-Latn-lojban won't be
2053 // matched.
2054 /* check if the tag is legacy */
2055 for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
2056 int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
2057 if (tagLen < checkLegacyLen) {
2058 continue;
2059 }
2060 if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
2061 // make sure next char is '-'.
2062 continue;
2063 }
2064 if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
2065 int32_t newTagLength;
2066
2067 legacyLen = checkLegacyLen; /* back up for output parsedLen */
2068 int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
2069 newTagLength = replacementLen + tagLen - checkLegacyLen;
2070 if (tagLen < newTagLength) {
2071 uprv_free(tagBuf);
2072 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2073 if (tagBuf == NULL) {
2074 *status = U_MEMORY_ALLOCATION_ERROR;
2075 return NULL;
2076 }
2077 t->buf = tagBuf;
2078 tagLen = newTagLength;
2079 }
2080 parsedLenDelta = checkLegacyLen - replacementLen;
2081 uprv_strcpy(t->buf, LEGACY[i + 1]);
2082 if (checkLegacyLen != tagLen) {
2083 uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
2084 }
2085 break;
2086 }
2087 }
2088
2089 if (legacyLen == 0) {
2090 for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2091 const char* redundantTag = REDUNDANT[i];
2092 size_t redundantTagLen = uprv_strlen(redundantTag);
2093 // The preferred tag for a redundant tag is always shorter than redundant
2094 // tag. A redundant tag may or may not be followed by other subtags.
2095 // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2096 if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2097 const char* redundantTagEnd = tagBuf + redundantTagLen;
2098 if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
2099 const char* preferredTag = REDUNDANT[i + 1];
2100 size_t preferredTagLen = uprv_strlen(preferredTag);
2101 uprv_strncpy(t->buf, preferredTag, preferredTagLen);
2102 if (*redundantTagEnd == SEP) {
2103 uprv_memmove(tagBuf + preferredTagLen,
2104 redundantTagEnd,
2105 tagLen - redundantTagLen + 1);
2106 } else {
2107 tagBuf[preferredTagLen] = '\0';
2108 }
2109 // parsedLen should be the length of the input
2110 // before redundantTag is replaced by preferredTag.
2111 // Save the delta to add it back later.
2112 parsedLenDelta = redundantTagLen - preferredTagLen;
2113 break;
2114 }
2115 }
2116 }
2117 }
2118
2119 /*
2120 * langtag = language
2121 * ["-" script]
2122 * ["-" region]
2123 * *("-" variant)
2124 * *("-" extension)
2125 * ["-" privateuse]
2126 */
2127
2128 next = LANG | PRIV;
2129 pNext = pLastGoodPosition = tagBuf;
2130 extlangIdx = 0;
2131 pExtension = NULL;
2132 pExtValueSubtag = NULL;
2133 pExtValueSubtagEnd = NULL;
2134
2135 while (pNext) {
2136 char *pSep;
2137
2138 pSubtag = pNext;
2139
2140 /* locate next separator char */
2141 pSep = pSubtag;
2142 while (*pSep) {
2143 if (*pSep == SEP) {
2144 break;
2145 }
2146 pSep++;
2147 }
2148 if (*pSep == 0) {
2149 /* last subtag */
2150 pNext = NULL;
2151 } else {
2152 pNext = pSep + 1;
2153 }
2154 subtagLen = (int32_t)(pSep - pSubtag);
2155
2156 if (next & LANG) {
2157 if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2158 *pSep = 0; /* terminate */
2159 // TODO: move deprecated language code handling here.
2160 t->language = T_CString_toLowerCase(pSubtag);
2161
2162 pLastGoodPosition = pSep;
2163 next = SCRT | REGN | VART | EXTS | PRIV;
2164 if (subtagLen <= 3)
2165 next |= EXTL;
2166 continue;
2167 }
2168 }
2169 if (next & EXTL) {
2170 if (_isExtlangSubtag(pSubtag, subtagLen)) {
2171 *pSep = 0;
2172 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2173
2174 pLastGoodPosition = pSep;
2175 if (extlangIdx < 3) {
2176 next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2177 } else {
2178 next = SCRT | REGN | VART | EXTS | PRIV;
2179 }
2180 continue;
2181 }
2182 }
2183 if (next & SCRT) {
2184 if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2185 char *p = pSubtag;
2186
2187 *pSep = 0;
2188
2189 /* to title case */
2190 *p = uprv_toupper(*p);
2191 p++;
2192 for (; *p; p++) {
2193 *p = uprv_tolower(*p);
2194 }
2195
2196 t->script = pSubtag;
2197
2198 pLastGoodPosition = pSep;
2199 next = REGN | VART | EXTS | PRIV;
2200 continue;
2201 }
2202 }
2203 if (next & REGN) {
2204 if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2205 *pSep = 0;
2206 // TODO: move deprecated region code handling here.
2207 t->region = T_CString_toUpperCase(pSubtag);
2208
2209 pLastGoodPosition = pSep;
2210 next = VART | EXTS | PRIV;
2211 continue;
2212 }
2213 }
2214 if (next & VART) {
2215 if (_isVariantSubtag(pSubtag, subtagLen) ||
2216 (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2217 VariantListEntry *var;
2218 UBool isAdded;
2219
2220 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2221 if (var == NULL) {
2222 *status = U_MEMORY_ALLOCATION_ERROR;
2223 return NULL;
2224 }
2225 *pSep = 0;
2226 var->variant = T_CString_toUpperCase(pSubtag);
2227 isAdded = _addVariantToList(&(t->variants), var);
2228 if (!isAdded) {
2229 /* duplicated variant entry */
2230 uprv_free(var);
2231 break;
2232 }
2233 pLastGoodPosition = pSep;
2234 next = VART | EXTS | PRIV;
2235 continue;
2236 }
2237 }
2238 if (next & EXTS) {
2239 if (_isExtensionSingleton(pSubtag, subtagLen)) {
2240 if (pExtension != NULL) {
2241 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2242 /* the previous extension is incomplete */
2243 uprv_free(pExtension);
2244 pExtension = NULL;
2245 break;
2246 }
2247
2248 /* terminate the previous extension value */
2249 *pExtValueSubtagEnd = 0;
2250 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2251
2252 /* insert the extension to the list */
2253 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2254 pLastGoodPosition = pExtValueSubtagEnd;
2255 } else {
2256 /* stop parsing here */
2257 uprv_free(pExtension);
2258 pExtension = NULL;
2259 break;
2260 }
2261 }
2262
2263 /* create a new extension */
2264 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2265 if (pExtension == NULL) {
2266 *status = U_MEMORY_ALLOCATION_ERROR;
2267 return NULL;
2268 }
2269 *pSep = 0;
2270 pExtension->key = T_CString_toLowerCase(pSubtag);
2271 pExtension->value = NULL; /* will be set later */
2272
2273 /*
2274 * reset the start and the end location of extension value
2275 * subtags for this extension
2276 */
2277 pExtValueSubtag = NULL;
2278 pExtValueSubtagEnd = NULL;
2279
2280 next = EXTV;
2281 continue;
2282 }
2283 }
2284 if (next & EXTV) {
2285 if (_isExtensionSubtag(pSubtag, subtagLen)) {
2286 if (pExtValueSubtag == NULL) {
2287 /* if the start postion of this extension's value is not yet,
2288 this one is the first value subtag */
2289 pExtValueSubtag = pSubtag;
2290 }
2291
2292 /* Mark the end of this subtag */
2293 pExtValueSubtagEnd = pSep;
2294 next = EXTS | EXTV | PRIV;
2295
2296 continue;
2297 }
2298 }
2299 if (next & PRIV) {
2300 if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2301 char *pPrivuseVal;
2302
2303 if (pExtension != NULL) {
2304 /* Process the last extension */
2305 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2306 /* the previous extension is incomplete */
2307 uprv_free(pExtension);
2308 pExtension = NULL;
2309 break;
2310 } else {
2311 /* terminate the previous extension value */
2312 *pExtValueSubtagEnd = 0;
2313 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2314
2315 /* insert the extension to the list */
2316 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2317 pLastGoodPosition = pExtValueSubtagEnd;
2318 pExtension = NULL;
2319 } else {
2320 /* stop parsing here */
2321 uprv_free(pExtension);
2322 pExtension = NULL;
2323 break;
2324 }
2325 }
2326 }
2327
2328 /* The rest of part will be private use value subtags */
2329 if (pNext == NULL) {
2330 /* empty private use subtag */
2331 break;
2332 }
2333 /* back up the private use value start position */
2334 pPrivuseVal = pNext;
2335
2336 /* validate private use value subtags */
2337 while (pNext) {
2338 pSubtag = pNext;
2339 pSep = pSubtag;
2340 while (*pSep) {
2341 if (*pSep == SEP) {
2342 break;
2343 }
2344 pSep++;
2345 }
2346 if (*pSep == 0) {
2347 /* last subtag */
2348 pNext = NULL;
2349 } else {
2350 pNext = pSep + 1;
2351 }
2352 subtagLen = (int32_t)(pSep - pSubtag);
2353
2354 if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2355 *pSep = 0;
2356 next = VART;
2357 privateuseVar = TRUE;
2358 break;
2359 } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2360 pLastGoodPosition = pSep;
2361 } else {
2362 break;
2363 }
2364 }
2365
2366 if (next == VART) {
2367 continue;
2368 }
2369
2370 if (pLastGoodPosition - pPrivuseVal > 0) {
2371 *pLastGoodPosition = 0;
2372 t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2373 }
2374 /* No more subtags, exiting the parse loop */
2375 break;
2376 }
2377 break;
2378 }
2379
2380 /* If we fell through here, it means this subtag is illegal - quit parsing */
2381 break;
2382 }
2383
2384 if (pExtension != NULL) {
2385 /* Process the last extension */
2386 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2387 /* the previous extension is incomplete */
2388 uprv_free(pExtension);
2389 } else {
2390 /* terminate the previous extension value */
2391 *pExtValueSubtagEnd = 0;
2392 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2393 /* insert the extension to the list */
2394 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2395 pLastGoodPosition = pExtValueSubtagEnd;
2396 } else {
2397 uprv_free(pExtension);
2398 }
2399 }
2400 }
2401
2402 if (parsedLen != NULL) {
2403 *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2404 }
2405
2406 return t.orphan();
2407 }
2408
2409 // Ticket #12705 - Turn optimization back on.
2410 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2411 #pragma optimize( "", on )
2412 #endif
2413
2414 static void
ultag_close(ULanguageTag * langtag)2415 ultag_close(ULanguageTag* langtag) {
2416
2417 if (langtag == NULL) {
2418 return;
2419 }
2420
2421 uprv_free(langtag->buf);
2422
2423 if (langtag->variants) {
2424 VariantListEntry *curVar = langtag->variants;
2425 while (curVar) {
2426 VariantListEntry *nextVar = curVar->next;
2427 uprv_free(curVar);
2428 curVar = nextVar;
2429 }
2430 }
2431
2432 if (langtag->extensions) {
2433 ExtensionListEntry *curExt = langtag->extensions;
2434 while (curExt) {
2435 ExtensionListEntry *nextExt = curExt->next;
2436 uprv_free(curExt);
2437 curExt = nextExt;
2438 }
2439 }
2440
2441 uprv_free(langtag);
2442 }
2443
/*
 * Returns the language field of the parsed tag. The pointer is the one
 * stored in langtag, so it is only usable while langtag is alive.
 * langtag must not be NULL (it is dereferenced unconditionally).
 */
static const char*
ultag_getLanguage(const ULanguageTag* langtag) {
    return langtag->language;
}
2448
#if 0
/*
 * Disabled: everything up to the matching #endif is compiled out.
 *
 * Looks langtag->language up in DEPRECATEDLANGS, which is walked here two
 * entries at a time until a NULL entry is reached. When an even-indexed
 * entry compares equal to the language (uprv_compareInvCharsAsAscii), the
 * entry that follows it is returned; if nothing matches, the language
 * itself is returned unchanged.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    int32_t i;
    for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) {
            return DEPRECATEDLANGS[i + 1];
        }
    }
    return langtag->language;
}
#endif
2461
2462 static const char*
ultag_getExtlang(const ULanguageTag * langtag,int32_t idx)2463 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2464 if (idx >= 0 && idx < MAXEXTLANG) {
2465 return langtag->extlang[idx];
2466 }
2467 return NULL;
2468 }
2469
2470 static int32_t
ultag_getExtlangSize(const ULanguageTag * langtag)2471 ultag_getExtlangSize(const ULanguageTag* langtag) {
2472 int32_t size = 0;
2473 int32_t i;
2474 for (i = 0; i < MAXEXTLANG; i++) {
2475 if (langtag->extlang[i]) {
2476 size++;
2477 }
2478 }
2479 return size;
2480 }
2481
/*
 * Returns the script field of the parsed tag. The pointer is the one
 * stored in langtag, so it is only usable while langtag is alive.
 * langtag must not be NULL (it is dereferenced unconditionally).
 */
static const char*
ultag_getScript(const ULanguageTag* langtag) {
    return langtag->script;
}
2486
/*
 * Returns the region field of the parsed tag. The pointer is the one
 * stored in langtag, so it is only usable while langtag is alive.
 * langtag must not be NULL (it is dereferenced unconditionally).
 */
static const char*
ultag_getRegion(const ULanguageTag* langtag) {
    return langtag->region;
}
2491
2492 static const char*
ultag_getVariant(const ULanguageTag * langtag,int32_t idx)2493 ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2494 const char *var = NULL;
2495 VariantListEntry *cur = langtag->variants;
2496 int32_t i = 0;
2497 while (cur) {
2498 if (i == idx) {
2499 var = cur->variant;
2500 break;
2501 }
2502 cur = cur->next;
2503 i++;
2504 }
2505 return var;
2506 }
2507
2508 static int32_t
ultag_getVariantsSize(const ULanguageTag * langtag)2509 ultag_getVariantsSize(const ULanguageTag* langtag) {
2510 int32_t size = 0;
2511 VariantListEntry *cur = langtag->variants;
2512 while (TRUE) {
2513 if (cur == NULL) {
2514 break;
2515 }
2516 size++;
2517 cur = cur->next;
2518 }
2519 return size;
2520 }
2521
2522 static const char*
ultag_getExtensionKey(const ULanguageTag * langtag,int32_t idx)2523 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2524 const char *key = NULL;
2525 ExtensionListEntry *cur = langtag->extensions;
2526 int32_t i = 0;
2527 while (cur) {
2528 if (i == idx) {
2529 key = cur->key;
2530 break;
2531 }
2532 cur = cur->next;
2533 i++;
2534 }
2535 return key;
2536 }
2537
2538 static const char*
ultag_getExtensionValue(const ULanguageTag * langtag,int32_t idx)2539 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2540 const char *val = NULL;
2541 ExtensionListEntry *cur = langtag->extensions;
2542 int32_t i = 0;
2543 while (cur) {
2544 if (i == idx) {
2545 val = cur->value;
2546 break;
2547 }
2548 cur = cur->next;
2549 i++;
2550 }
2551 return val;
2552 }
2553
2554 static int32_t
ultag_getExtensionsSize(const ULanguageTag * langtag)2555 ultag_getExtensionsSize(const ULanguageTag* langtag) {
2556 int32_t size = 0;
2557 ExtensionListEntry *cur = langtag->extensions;
2558 while (TRUE) {
2559 if (cur == NULL) {
2560 break;
2561 }
2562 size++;
2563 cur = cur->next;
2564 }
2565 return size;
2566 }
2567
/*
 * Returns the private use field of the parsed tag. The pointer is the one
 * stored in langtag, so it is only usable while langtag is alive.
 * langtag must not be NULL (it is dereferenced unconditionally).
 */
static const char*
ultag_getPrivateUse(const ULanguageTag* langtag) {
    return langtag->privateuse;
}
2572
#if 0
/*
 * Disabled: everything up to the matching #endif is compiled out.
 *
 * Returns the legacy field of the parsed tag as stored in langtag.
 */
static const char*
ultag_getLegacy(const ULanguageTag* langtag) {
    return langtag->legacy;
}
#endif
2579
2580
2581 /*
2582 * -------------------------------------------------
2583 *
2584 * Locale/BCP47 conversion APIs, exposed as uloc_*
2585 *
2586 * -------------------------------------------------
2587 */
2588 U_CAPI int32_t U_EXPORT2
uloc_toLanguageTag(const char * localeID,char * langtag,int32_t langtagCapacity,UBool strict,UErrorCode * status)2589 uloc_toLanguageTag(const char* localeID,
2590 char* langtag,
2591 int32_t langtagCapacity,
2592 UBool strict,
2593 UErrorCode* status) {
2594 if (U_FAILURE(*status)) {
2595 return 0;
2596 }
2597
2598 icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2599 ulocimp_toLanguageTag(localeID, sink, strict, status);
2600
2601 int32_t reslen = sink.NumberOfBytesAppended();
2602
2603 if (U_FAILURE(*status)) {
2604 return reslen;
2605 }
2606
2607 if (sink.Overflowed()) {
2608 *status = U_BUFFER_OVERFLOW_ERROR;
2609 } else {
2610 u_terminateChars(langtag, langtagCapacity, reslen, status);
2611 }
2612
2613 return reslen;
2614 }
2615
2616
/*
 * Writes the language tag for localeID to sink.
 *
 * Steps, in order:
 *   1. Unless localeID is the empty string, canonicalize it with
 *      uloc_canonicalize() into the local CharString `canonical`.
 *   2. If the keywords of the canonical ID start at its very first
 *      character and there is exactly one keyword, named "x", emit the
 *      private-use-only tag "x-<value>" and return.
 *   3. Otherwise append language, script, region, variants, keywords and
 *      private use via the _append*ToLanguageTag() helpers.
 *
 * localeID: must not be NULL (uprv_strlen() is applied to it).
 * sink:     receives the tag bytes.
 * strict:   forwarded to the helpers; in step 2 it turns an ill-formed
 *           private use value into U_ILLEGAL_ARGUMENT_ERROR instead of
 *           falling through to step 3.
 * status:   receives errors. Allocation failures from CharString are passed
 *           through; any other canonicalization failure and a failed
 *           keyword-value lookup are reported as U_ILLEGAL_ARGUMENT_ERROR.
 *
 * NOTE(review): unlike uloc_toLanguageTag(), this function does not test
 * *status on entry; callers are presumably expected to do so - confirm.
 */
U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char* localeID,
                      icu::ByteSink& sink,
                      UBool strict,
                      UErrorCode* status) {
    icu::CharString canonical;
    int32_t reslen;
    /* scratch status so that intermediate warnings/errors do not leak into *status */
    UErrorCode tmpStatus = U_ZERO_ERROR;
    UBool hadPosix = FALSE;
    const char* pKeywordStart;

    /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
    /* first guess for the buffer size: the length of the input itself */
    int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
    if (resultCapacity > 0) {
        char* buffer;

        /* canonicalize, retrying with a bigger buffer for as long as it overflows */
        for (;;) {
            /* resultCapacity is also passed as the third argument; presumably
               it is updated to the capacity actually granted - TODO confirm
               against CharString::getAppendBuffer() */
            buffer = canonical.getAppendBuffer(
                    /*minCapacity=*/resultCapacity,
                    /*desiredCapacityHint=*/resultCapacity,
                    resultCapacity,
                    tmpStatus);

            if (U_FAILURE(tmpStatus)) {
                *status = tmpStatus;
                return;
            }

            reslen =
                uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);

            if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
                break;
            }

            /* overflow: use the length reported by uloc_canonicalize for the next attempt */
            resultCapacity = reslen;
            tmpStatus = U_ZERO_ERROR;
        }

        /* any remaining canonicalization failure is reported as a bad argument */
        if (U_FAILURE(tmpStatus)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }

        /* commit the reslen bytes that were written into the append buffer */
        canonical.append(buffer, reslen, tmpStatus);
        if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
            tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
        }

        if (U_FAILURE(tmpStatus)) {
            *status = tmpStatus;
            return;
        }
    }

    /* For handling special case - private use only tag */
    pKeywordStart = locale_getKeywordsStart(canonical.data());
    /* true only when the keywords begin at the first character of the ID,
       i.e. nothing precedes them */
    if (pKeywordStart == canonical.data()) {
        int kwdCnt = 0;
        UBool done = FALSE;

        icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
        if (U_SUCCESS(tmpStatus)) {
            kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
            if (kwdCnt == 1) {
                const char *key;
                int32_t len = 0;

                key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
                /* len is tested first, so key is only dereferenced when a
                   one-character key was reported */
                if (len == 1 && *key == PRIVATEUSE) {
                    icu::CharString buf;
                    {
                        /* this local sink deliberately shadows the `sink`
                           parameter inside these braces; it collects the
                           keyword value into buf. Note that the value is
                           looked up in the original localeID, not in
                           `canonical`. */
                        icu::CharStringByteSink sink(&buf);
                        ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
                    }
                    if (U_SUCCESS(tmpStatus)) {
                        if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
                            /* return private use only tag */
                            /* PREFIX is 'x','-' with no terminating NUL, so
                               sizeof(PREFIX) is exactly the 2 bytes to emit */
                            static const char PREFIX[] = { PRIVATEUSE, SEP };
                            sink.Append(PREFIX, sizeof(PREFIX));
                            sink.Append(buf.data(), buf.length());
                            done = TRUE;
                        } else if (strict) {
                            *status = U_ILLEGAL_ARGUMENT_ERROR;
                            done = TRUE;
                        }
                        /* if not strict mode, then "und" will be returned */
                    } else {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        done = TRUE;
                    }
                }
            }
            if (done) {
                return;
            }
        }
    }

    /* general case: build the tag piece by piece; hadPosix is filled in by
       the variants step and handed on to the keyword and private use steps */
    _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
    _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
    _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
    _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
    _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
    _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
}
2723
2724
2725 U_CAPI int32_t U_EXPORT2
uloc_forLanguageTag(const char * langtag,char * localeID,int32_t localeIDCapacity,int32_t * parsedLength,UErrorCode * status)2726 uloc_forLanguageTag(const char* langtag,
2727 char* localeID,
2728 int32_t localeIDCapacity,
2729 int32_t* parsedLength,
2730 UErrorCode* status) {
2731 if (U_FAILURE(*status)) {
2732 return 0;
2733 }
2734
2735 icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2736 ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2737
2738 int32_t reslen = sink.NumberOfBytesAppended();
2739
2740 if (U_FAILURE(*status)) {
2741 return reslen;
2742 }
2743
2744 if (sink.Overflowed()) {
2745 *status = U_BUFFER_OVERFLOW_ERROR;
2746 } else {
2747 u_terminateChars(localeID, localeIDCapacity, reslen, status);
2748 }
2749
2750 return reslen;
2751 }
2752
2753
2754 U_CAPI void U_EXPORT2
ulocimp_forLanguageTag(const char * langtag,int32_t tagLen,icu::ByteSink & sink,int32_t * parsedLength,UErrorCode * status)2755 ulocimp_forLanguageTag(const char* langtag,
2756 int32_t tagLen,
2757 icu::ByteSink& sink,
2758 int32_t* parsedLength,
2759 UErrorCode* status) {
2760 UBool isEmpty = TRUE;
2761 const char *subtag, *p;
2762 int32_t len;
2763 int32_t i, n;
2764 UBool noRegion = TRUE;
2765
2766 icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2767 if (U_FAILURE(*status)) {
2768 return;
2769 }
2770
2771 /* language */
2772 subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2773 if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2774 len = (int32_t)uprv_strlen(subtag);
2775 if (len > 0) {
2776 sink.Append(subtag, len);
2777 isEmpty = FALSE;
2778 }
2779 }
2780
2781 /* script */
2782 subtag = ultag_getScript(lt.getAlias());
2783 len = (int32_t)uprv_strlen(subtag);
2784 if (len > 0) {
2785 sink.Append("_", 1);
2786 isEmpty = FALSE;
2787
2788 /* write out the script in title case */
2789 char c = uprv_toupper(*subtag);
2790 sink.Append(&c, 1);
2791 sink.Append(subtag + 1, len - 1);
2792 }
2793
2794 /* region */
2795 subtag = ultag_getRegion(lt.getAlias());
2796 len = (int32_t)uprv_strlen(subtag);
2797 if (len > 0) {
2798 sink.Append("_", 1);
2799 isEmpty = FALSE;
2800
2801 /* write out the region in upper case */
2802 p = subtag;
2803 while (*p) {
2804 char c = uprv_toupper(*p);
2805 sink.Append(&c, 1);
2806 p++;
2807 }
2808 noRegion = FALSE;
2809 }
2810
2811 /* variants */
2812 _sortVariants(lt.getAlias()->variants);
2813 n = ultag_getVariantsSize(lt.getAlias());
2814 if (n > 0) {
2815 if (noRegion) {
2816 sink.Append("_", 1);
2817 isEmpty = FALSE;
2818 }
2819
2820 for (i = 0; i < n; i++) {
2821 subtag = ultag_getVariant(lt.getAlias(), i);
2822 sink.Append("_", 1);
2823
2824 /* write out the variant in upper case */
2825 p = subtag;
2826 while (*p) {
2827 char c = uprv_toupper(*p);
2828 sink.Append(&c, 1);
2829 p++;
2830 }
2831 }
2832 }
2833
2834 /* keywords */
2835 n = ultag_getExtensionsSize(lt.getAlias());
2836 subtag = ultag_getPrivateUse(lt.getAlias());
2837 if (n > 0 || uprv_strlen(subtag) > 0) {
2838 if (isEmpty && n > 0) {
2839 /* need a language */
2840 sink.Append(LANG_UND, LANG_UND_LEN);
2841 }
2842 _appendKeywords(lt.getAlias(), sink, status);
2843 }
2844 }
2845