• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2009-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
17 #include "ustr_imp.h"
18 #include "bytesinkutil.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24 #include "ulocimp.h"
25 #include "uassert.h"
26 
27 
/*
 * Node of a singly linked list of variant subtags.
 */
typedef struct VariantListEntry {
    const char *variant;            /* text of this variant */
    struct VariantListEntry *next;  /* following node; NULL marks the end of the list */
} VariantListEntry;
33 
34 /* struct holding a single attribute value */
35 struct AttributeListEntry : public icu::UMemory {
36     const char              *attribute;
37     struct AttributeListEntry *next;
38 };
39 
40 /* struct holding a single extension */
41 struct ExtensionListEntry : public icu::UMemory {
42     const char                  *key;
43     const char                  *value;
44     struct ExtensionListEntry   *next;
45 };
46 
47 #define MAXEXTLANG 3
48 typedef struct ULanguageTag {
49     char                *buf;   /* holding parsed subtags */
50     const char          *language;
51     const char          *extlang[MAXEXTLANG];
52     const char          *script;
53     const char          *region;
54     VariantListEntry    *variants;
55     ExtensionListEntry  *extensions;
56     const char          *privateuse;
57     const char          *legacy;
58 } ULanguageTag;
59 
/* NOTE(review): MINLEN is not referenced by the subtag validators in this file section; confirm where it is used. */
60 #define MINLEN 2
/* Character that separates the subtags of a language tag (see _isSepListOf). */
61 #define SEP '-'
/* Single-letter key of the private use part; _isExtensionSingleton rejects it as an extension singleton. */
62 #define PRIVATEUSE 'x'
/* Single-letter key of the "u" extension; used by _addExtensionToList when ordering keys. */
63 #define LDMLEXT 'u'
64 
/* Punctuation characters, presumably those of ICU locale IDs (e.g. "@", ";", "=") - TODO confirm against their users. */
65 #define LOCALE_SEP '_'
66 #define LOCALE_EXT_SEP '@'
67 #define LOCALE_KEYWORD_SEP ';'
68 #define LOCALE_KEY_TYPE_SEP '='
69 
/* Character class tests. ISNUMERIC evaluates its argument twice, so pass only side-effect-free expressions. */
70 #define ISALPHA(c) uprv_isASCIILetter(c)
71 #define ISNUMERIC(c) ((c)>='0' && (c)<='9')
72 
/* Shared empty string; _initializeULanguageTag uses it as the default for string members. */
73 static const char EMPTY[] = "";
/* Language subtag written when a locale has no usable language (see _appendLanguageToLanguageTag). */
74 static const char LANG_UND[] = "und";
75 static const char PRIVATEUSE_KEY[] = "x";
76 static const char _POSIX[] = "_POSIX";
77 static const char POSIX_KEY[] = "va";
78 static const char POSIX_VALUE[] = "posix";
/* Extension key that _addExtensionToList orders ahead of the other multi-letter keys. */
79 static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
80 static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
81 static const char LOCALE_TYPE_YES[] = "yes";
82 
/* Length of LANG_UND without its terminating NUL. */
83 #define LANG_UND_LEN 3
84 
85 /*
86  Updated on 2018-09-12 from
87  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
88 
89  This table has 2 parts. The part for
90  legacy language tags (marked as “Type: grandfathered” in BCP 47)
91  is generated by the following scripts from the IANA language tag registry.
92 
93  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
94  egrep -A 7 'Type: grandfathered' | \
95  egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
96  awk -n '/Tag/ {printf("    \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
97  tr 'A-Z' 'a-z'
98 
99 
100  The 2nd part is made of five ICU-specific entries. They're kept for
101  the backward compatibility for now, even though there are no preferred
102  values. They may have to be removed for the strict BCP 47 compliance.
103 
104 */
/*
 * Flat list of lower-case string pairs: each even index holds a legacy tag
 * and the following odd index holds the tag that replaces it.
 */
105 static const char* const LEGACY[] = {
106 /*  legacy          preferred */
107     "art-lojban",   "jbo",
108     "en-gb-oed",    "en-gb-oxendict",
109     "i-ami",        "ami",
110     "i-bnn",        "bnn",
111     "i-hak",        "hak",
112     "i-klingon",    "tlh",
113     "i-lux",        "lb",
114     "i-navajo",     "nv",
115     "i-pwn",        "pwn",
116     "i-tao",        "tao",
117     "i-tay",        "tay",
118     "i-tsu",        "tsu",
119     "no-bok",       "nb",
120     "no-nyn",       "nn",
121     "sgn-be-fr",    "sfb",
122     "sgn-be-nl",    "vgt",
123     "sgn-ch-de",    "sgg",
124     "zh-guoyu",     "cmn",
125     "zh-hakka",     "hak",
126     "zh-min-nan",   "nan",
127     "zh-xiang",     "hsn",
128 
129     // Legacy tags with no preferred value in the IANA
130     // registry. Kept for now for the backward compatibility
131     // because ICU has mapped them this way.
    // Each replacement below is an ICU-chosen language followed by
    // a private use ("-x-") part.
132     "cel-gaulish",  "xtg-x-cel-gaulish",
133     "i-default",    "en-x-i-default",
134     "i-enochian",   "und-x-i-enochian",
135     "i-mingo",      "see-x-i-mingo",
136     "zh-min",       "nan-x-zh-min",
137 };
138 
139 /*
140  Updated on 2018-09-12 from
141  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
142 
143  The table lists redundant tags with preferred value in the IANA language tag registry.
144  It's generated with the following command:
145 
146  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
147  grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
148  awk -n '/Tag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
149  tr 'A-Z' 'a-z'
150 
151  In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
152  a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'.
153 */
154 
/*
 * Flat list of lower-case string pairs: each even index holds a redundant
 * tag and the following odd index holds its preferred replacement.
 */
155 static const char* const REDUNDANT[] = {
156 //  redundant       preferred
157     "sgn-br",       "bzs",
158     "sgn-co",       "csn",
159     "sgn-de",       "gsg",
160     "sgn-dk",       "dsl",
161     "sgn-es",       "ssp",
162     "sgn-fr",       "fsl",
163     "sgn-gb",       "bfi",
164     "sgn-gr",       "gss",
165     "sgn-ie",       "isg",
166     "sgn-it",       "ise",
167     "sgn-jp",       "jsl",
168     "sgn-mx",       "mfs",
169     "sgn-ni",       "ncs",
170     "sgn-nl",       "dse",
171     "sgn-no",       "nsl",
172     "sgn-pt",       "psr",
173     "sgn-se",       "swl",
174     "sgn-us",       "ase",
175     "sgn-za",       "sfs",
176     "zh-cmn",       "cmn",
177     "zh-cmn-hans",  "cmn-hans",
178     "zh-cmn-hant",  "cmn-hant",
179     "zh-gan",       "gan",
180     "zh-wuu",       "wuu",
181     "zh-yue",       "yue",
182 
183     // variant tag with preferred value
    // (added by hand; not produced by the registry script)
184     "ja-latn-hepburn-heploc", "ja-latn-alalc97",
185 };
186 
187 /*
188   Updated on 2018-09-12 from
189   https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
190 
191   grep 'Type: language' -A 7 language-subtag-registry  | egrep 'Subtag|Prefe' | \
192   grep -B1 'Preferred' | grep -v '^--' | \
193   awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
194 
195   Make sure that 2-letter language subtags come before 3-letter subtags.
196 */
/*
 * Flat list of pairs: each even index holds a deprecated language subtag and
 * the following odd index holds its replacement. Every element is a 4-byte
 * char array, which fits a 3-letter code plus the terminating NUL.
 * All 2-letter deprecated codes precede the 3-letter ones; the lookup in
 * _appendLanguageToLanguageTag relies on that ordering.
 */
197 static const char DEPRECATEDLANGS[][4] = {
198 /*  deprecated  new */
199     "in",       "id",
200     "iw",       "he",
201     "ji",       "yi",
202     "jw",       "jv",
203     "mo",       "ro",
204     "aam",       "aas",
205     "adp",       "dz",
206     "aue",       "ktz",
207     "ayx",       "nun",
208     "bgm",       "bcg",
209     "bjd",       "drl",
210     "ccq",       "rki",
211     "cjr",       "mom",
212     "cka",       "cmr",
213     "cmk",       "xch",
214     "coy",       "pij",
215     "cqu",       "quh",
216     "drh",       "khk",
217     "drw",       "prs",
218     "gav",       "dev",
219     "gfx",       "vaj",
220     "ggn",       "gvr",
221     "gti",       "nyc",
222     "guv",       "duz",
223     "hrr",       "jal",
224     "ibi",       "opa",
225     "ilw",       "gal",
226     "jeg",       "oyb",
227     "kgc",       "tdf",
228     "kgh",       "kml",
229     "koj",       "kwv",
230     "krm",       "bmf",
231     "ktr",       "dtp",
232     "kvs",       "gdj",
233     "kwq",       "yam",
234     "kxe",       "tvd",
235     "kzj",       "dtp",
236     "kzt",       "dtp",
237     "lii",       "raq",
238     "lmm",       "rmx",
239     "meg",       "cir",
240     "mst",       "mry",
241     "mwj",       "vaj",
242     "myt",       "mry",
243     "nad",       "xny",
244     "ncp",       "kdz",
245     "nnx",       "ngv",
246     "nts",       "pij",
247     "oun",       "vaj",
248     "pcr",       "adx",
249     "pmc",       "huw",
250     "pmu",       "phr",
251     "ppa",       "bfy",
252     "ppr",       "lcq",
253     "pry",       "prt",
254     "puz",       "pub",
255     "sca",       "hle",
256     "skk",       "oyb",
257     "tdu",       "dtp",
258     "thc",       "tpo",
259     "thx",       "oyb",
260     "tie",       "ras",
261     "tkk",       "twm",
262     "tlw",       "weo",
263     "tmp",       "tyj",
264     "tne",       "kak",
265     "tnf",       "prs",
266     "tsf",       "taj",
267     "uok",       "ema",
268     "xba",       "cax",
269     "xia",       "acn",
270     "xkh",       "waw",
271     "xsj",       "suj",
272     "ybd",       "rki",
273     "yma",       "lrr",
274     "ymt",       "mtm",
275     "yos",       "zom",
276     "yuu",       "yug",
277 };
278 
279 /*
280   Updated on 2018-04-24 from
281 
282   curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
283   grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
284   grep -B1 'Preferred' | \
285   awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
286 */
/*
 * Flat list of pairs: each even index holds a deprecated region subtag and
 * the following odd index holds its replacement. Every element is a 3-byte
 * char array, which fits a 2-letter code plus the terminating NUL.
 */
287 static const char DEPRECATEDREGIONS[][3] = {
288 /*  deprecated  new */
289     "BU",       "MM",
290     "DD",       "DE",
291     "FX",       "FR",
292     "TP",       "TL",
293     "YD",       "YE",
294     "ZR",       "CD",
295 };
296 
297 /*
298 * -------------------------------------------------
299 *
300 * These ultag_ functions may be exposed as APIs later
301 *
302 * -------------------------------------------------
303 */
304 
305 static ULanguageTag*
306 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);
307 
308 static void
309 ultag_close(ULanguageTag* langtag);
310 
311 static const char*
312 ultag_getLanguage(const ULanguageTag* langtag);
313 
314 #if 0
315 static const char*
316 ultag_getJDKLanguage(const ULanguageTag* langtag);
317 #endif
318 
319 static const char*
320 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);
321 
322 static int32_t
323 ultag_getExtlangSize(const ULanguageTag* langtag);
324 
325 static const char*
326 ultag_getScript(const ULanguageTag* langtag);
327 
328 static const char*
329 ultag_getRegion(const ULanguageTag* langtag);
330 
331 static const char*
332 ultag_getVariant(const ULanguageTag* langtag, int32_t idx);
333 
334 static int32_t
335 ultag_getVariantsSize(const ULanguageTag* langtag);
336 
337 static const char*
338 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);
339 
340 static const char*
341 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);
342 
343 static int32_t
344 ultag_getExtensionsSize(const ULanguageTag* langtag);
345 
346 static const char*
347 ultag_getPrivateUse(const ULanguageTag* langtag);
348 
349 #if 0
350 static const char*
351 ultag_getLegacy(const ULanguageTag* langtag);
352 #endif
353 
354 U_NAMESPACE_BEGIN
355 
356 /**
357  * \class LocalULanguageTagPointer
358  * "Smart pointer" class, closes a ULanguageTag via ultag_close().
359  * For most methods see the LocalPointerBase base class.
 * The class is produced by the U_DEFINE_LOCAL_OPEN_POINTER invocation that
 * follows this comment, which names ultag_close as the closing function.
360  *
361  * @see LocalPointerBase
362  * @see LocalPointer
363  * @internal
364  */
365 U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);
366 
367 U_NAMESPACE_END
368 
369 /*
370 * -------------------------------------------------
371 *
372 * Language subtag syntax validation functions
373 *
374 * -------------------------------------------------
375 */
376 
377 static UBool
_isAlphaString(const char * s,int32_t len)378 _isAlphaString(const char* s, int32_t len) {
379     int32_t i;
380     for (i = 0; i < len; i++) {
381         if (!ISALPHA(*(s + i))) {
382             return FALSE;
383         }
384     }
385     return TRUE;
386 }
387 
388 static UBool
_isNumericString(const char * s,int32_t len)389 _isNumericString(const char* s, int32_t len) {
390     int32_t i;
391     for (i = 0; i < len; i++) {
392         if (!ISNUMERIC(*(s + i))) {
393             return FALSE;
394         }
395     }
396     return TRUE;
397 }
398 
399 static UBool
_isAlphaNumericString(const char * s,int32_t len)400 _isAlphaNumericString(const char* s, int32_t len) {
401     int32_t i;
402     for (i = 0; i < len; i++) {
403         if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
404             return FALSE;
405         }
406     }
407     return TRUE;
408 }
409 
410 static UBool
_isAlphaNumericStringLimitedLength(const char * s,int32_t len,int32_t min,int32_t max)411 _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
412     if (len < 0) {
413         len = (int32_t)uprv_strlen(s);
414     }
415     if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
416         return TRUE;
417     }
418     return FALSE;
419 }
420 
421 U_CFUNC UBool
ultag_isLanguageSubtag(const char * s,int32_t len)422 ultag_isLanguageSubtag(const char* s, int32_t len) {
423     /*
424      * unicode_language_subtag = alpha{2,3} | alpha{5,8};
425      * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
426      * See ICU-20372
427      */
428     if (len < 0) {
429         len = (int32_t)uprv_strlen(s);
430     }
431     if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
432         return TRUE;
433     }
434     return FALSE;
435 }
436 
437 static UBool
_isExtlangSubtag(const char * s,int32_t len)438 _isExtlangSubtag(const char* s, int32_t len) {
439     /*
440      * extlang       = 3ALPHA              ; selected ISO 639 codes
441      *                 *2("-" 3ALPHA)      ; permanently reserved
442      */
443     if (len < 0) {
444         len = (int32_t)uprv_strlen(s);
445     }
446     if (len == 3 && _isAlphaString(s, len)) {
447         return TRUE;
448     }
449     return FALSE;
450 }
451 
452 U_CFUNC UBool
ultag_isScriptSubtag(const char * s,int32_t len)453 ultag_isScriptSubtag(const char* s, int32_t len) {
454     /*
455      * script        = 4ALPHA              ; ISO 15924 code
456      */
457     if (len < 0) {
458         len = (int32_t)uprv_strlen(s);
459     }
460     if (len == 4 && _isAlphaString(s, len)) {
461         return TRUE;
462     }
463     return FALSE;
464 }
465 
466 U_CFUNC UBool
ultag_isRegionSubtag(const char * s,int32_t len)467 ultag_isRegionSubtag(const char* s, int32_t len) {
468     /*
469      * region        = 2ALPHA              ; ISO 3166-1 code
470      *               / 3DIGIT              ; UN M.49 code
471      */
472     if (len < 0) {
473         len = (int32_t)uprv_strlen(s);
474     }
475     if (len == 2 && _isAlphaString(s, len)) {
476         return TRUE;
477     }
478     if (len == 3 && _isNumericString(s, len)) {
479         return TRUE;
480     }
481     return FALSE;
482 }
483 
484 static UBool
_isVariantSubtag(const char * s,int32_t len)485 _isVariantSubtag(const char* s, int32_t len) {
486     /*
487      * variant       = 5*8alphanum         ; registered variants
488      *               / (DIGIT 3alphanum)
489      */
490     if (len < 0) {
491         len = (int32_t)uprv_strlen(s);
492     }
493     if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
494         return TRUE;
495     }
496     if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
497         return TRUE;
498     }
499     return FALSE;
500 }
501 
502 static UBool
_isSepListOf(UBool (* test)(const char *,int32_t),const char * s,int32_t len)503 _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
504     const char *p = s;
505     const char *pSubtag = NULL;
506 
507     if (len < 0) {
508         len = (int32_t)uprv_strlen(s);
509     }
510 
511     while ((p - s) < len) {
512         if (*p == SEP) {
513             if (pSubtag == NULL) {
514                 return FALSE;
515             }
516             if (!test(pSubtag, (int32_t)(p - pSubtag))) {
517                 return FALSE;
518             }
519             pSubtag = NULL;
520         } else if (pSubtag == NULL) {
521             pSubtag = p;
522         }
523         p++;
524     }
525     if (pSubtag == NULL) {
526         return FALSE;
527     }
528     return test(pSubtag, (int32_t)(p - pSubtag));
529 }
530 
531 U_CFUNC UBool
ultag_isVariantSubtags(const char * s,int32_t len)532 ultag_isVariantSubtags(const char* s, int32_t len) {
533     return _isSepListOf(&_isVariantSubtag, s, len);
534 }
535 
536 // This is for the ICU-specific "lvariant" handling.
537 static UBool
_isPrivateuseVariantSubtag(const char * s,int32_t len)538 _isPrivateuseVariantSubtag(const char* s, int32_t len) {
539     /*
540      * variant       = 1*8alphanum         ; registered variants
541      *               / (DIGIT 3alphanum)
542      */
543     return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
544 }
545 
546 static UBool
_isExtensionSingleton(const char * s,int32_t len)547 _isExtensionSingleton(const char* s, int32_t len) {
548     /*
549      * extension     = singleton 1*("-" (2*8alphanum))
550      *
551      * singleton     = DIGIT               ; 0 - 9
552      *               / %x41-57             ; A - W
553      *               / %x59-5A             ; Y - Z
554      *               / %x61-77             ; a - w
555      *               / %x79-7A             ; y - z
556      */
557     if (len < 0) {
558         len = (int32_t)uprv_strlen(s);
559     }
560     if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
561         return TRUE;
562     }
563     return FALSE;
564 }
565 
566 static UBool
_isExtensionSubtag(const char * s,int32_t len)567 _isExtensionSubtag(const char* s, int32_t len) {
568     /*
569      * extension     = singleton 1*("-" (2*8alphanum))
570      */
571     return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
572 }
573 
574 U_CFUNC UBool
ultag_isExtensionSubtags(const char * s,int32_t len)575 ultag_isExtensionSubtags(const char* s, int32_t len) {
576     return _isSepListOf(&_isExtensionSubtag, s, len);
577 }
578 
579 static UBool
_isPrivateuseValueSubtag(const char * s,int32_t len)580 _isPrivateuseValueSubtag(const char* s, int32_t len) {
581     /*
582      * privateuse    = "x" 1*("-" (1*8alphanum))
583      */
584     return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
585 }
586 
587 U_CFUNC UBool
ultag_isPrivateuseValueSubtags(const char * s,int32_t len)588 ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
589     return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
590 }
591 
592 U_CFUNC UBool
ultag_isUnicodeLocaleAttribute(const char * s,int32_t len)593 ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
594     /*
595      * attribute = alphanum{3,8} ;
596      */
597     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
598 }
599 
600 U_CFUNC UBool
ultag_isUnicodeLocaleAttributes(const char * s,int32_t len)601 ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
602     return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
603 }
604 
605 U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char * s,int32_t len)606 ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
607     /*
608      * key = alphanum alpha ;
609      */
610     if (len < 0) {
611         len = (int32_t)uprv_strlen(s);
612     }
613     if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
614         return TRUE;
615     }
616     return FALSE;
617 }
618 
619 U_CFUNC UBool
_isUnicodeLocaleTypeSubtag(const char * s,int32_t len)620 _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
621     /*
622      * alphanum{3,8}
623      */
624     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
625 }
626 
627 U_CFUNC UBool
ultag_isUnicodeLocaleType(const char * s,int32_t len)628 ultag_isUnicodeLocaleType(const char*s, int32_t len) {
629     /*
630      * type = alphanum{3,8} (sep alphanum{3,8})* ;
631      */
632     return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
633 }
634 
635 static UBool
_isTKey(const char * s,int32_t len)636 _isTKey(const char* s, int32_t len)
637 {
638     /*
639      * tkey = alpha digit ;
640      */
641     if (len < 0) {
642         len = (int32_t)uprv_strlen(s);
643     }
644     if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
645         return TRUE;
646     }
647     return FALSE;
648 }
649 
650 static UBool
_isTValue(const char * s,int32_t len)651 _isTValue(const char* s, int32_t len)
652 {
653     /*
654      * tvalue = (sep alphanum{3,8})+ ;
655      */
656     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
657 }
658 
659 static UBool
_isTransformedExtensionSubtag(int32_t & state,const char * s,int32_t len)660 _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
661 {
662     const int32_t kStart = 0;       // Start, wait for unicode_language_subtag, tkey or end
663     const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
664                                     // unicode_region_subtag, unicode_variant_subtag, tkey or end
665     const int32_t kGotScript = 2;   // Got unicode_script_subtag, wait for unicode_region_subtag,
666                                     // unicode_variant_subtag, tkey, or end
667     const int32_t kGotRegion = 3;   // Got unicode_region_subtag, wait for unicode_variant_subtag,
668                                     // tkey, or end.
669     const int32_t kGotVariant = 4;  // Got unicode_variant_subtag, wait for unicode_variant_subtag
670                                     // tkey or end.
671     const int32_t kGotTKey = -1;    // Got tkey, wait for tvalue. ERROR if stop here.
672     const int32_t kGotTValue = 6;   // Got tvalue, wait for tkey, tvalue or end
673 
674     switch (state) {
675         case kStart:
676             if (ultag_isLanguageSubtag(s, len)) {
677                 state = kGotLanguage;
678                 return TRUE;
679             }
680             if (_isTKey(s, len)) {
681                 state = kGotTKey;
682                 return TRUE;
683             }
684             return FALSE;
685         case kGotLanguage:
686             if (ultag_isScriptSubtag(s, len)) {
687                 state = kGotScript;
688                 return TRUE;
689             }
690             U_FALLTHROUGH;
691         case kGotScript:
692             if (ultag_isRegionSubtag(s, len)) {
693                 state = kGotRegion;
694                 return TRUE;
695             }
696             U_FALLTHROUGH;
697         case kGotRegion:
698             U_FALLTHROUGH;
699         case kGotVariant:
700             if (_isVariantSubtag(s, len)) {
701                 state = kGotVariant;
702                 return TRUE;
703             }
704             if (_isTKey(s, len)) {
705                 state = kGotTKey;
706                 return TRUE;
707             }
708             return FALSE;
709         case kGotTKey:
710             if (_isTValue(s, len)) {
711                 state = kGotTValue;
712                 return TRUE;
713             }
714             return FALSE;
715         case kGotTValue:
716             if (_isTKey(s, len)) {
717                 state = kGotTKey;
718                 return TRUE;
719             }
720             if (_isTValue(s, len)) {
721                 return TRUE;
722             }
723             return FALSE;
724     }
725     return FALSE;
726 }
727 
728 static UBool
_isUnicodeExtensionSubtag(int32_t & state,const char * s,int32_t len)729 _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
730 {
731     const int32_t kStart = 0;         // Start, wait for a key or attribute or end
732     const int32_t kGotKey = 1;        // Got a key, wait for type or key or end
733     const int32_t kGotType = 2;       // Got a type, wait for key or end
734 
735     switch (state) {
736         case kStart:
737             if (ultag_isUnicodeLocaleKey(s, len)) {
738                 state = kGotKey;
739                 return TRUE;
740             }
741             if (ultag_isUnicodeLocaleAttribute(s, len)) {
742                 return TRUE;
743             }
744             return FALSE;
745         case kGotKey:
746             if (ultag_isUnicodeLocaleKey(s, len)) {
747                 return TRUE;
748             }
749             if (_isUnicodeLocaleTypeSubtag(s, len)) {
750                 state = kGotType;
751                 return TRUE;
752             }
753             return FALSE;
754         case kGotType:
755             if (ultag_isUnicodeLocaleKey(s, len)) {
756                 state = kGotKey;
757                 return TRUE;
758             }
759             if (_isUnicodeLocaleTypeSubtag(s, len)) {
760                 return TRUE;
761             }
762             return FALSE;
763     }
764     return FALSE;
765 }
766 
767 static UBool
_isStatefulSepListOf(UBool (* test)(int32_t &,const char *,int32_t),const char * s,int32_t len)768 _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
769 {
770     int32_t state = 0;
771     const char* p;
772     const char* start = s;
773     int32_t subtagLen = 0;
774 
775     if (len < 0) {
776         len = (int32_t)uprv_strlen(s);
777     }
778 
779     for (p = s; len > 0; p++, len--) {
780         if (*p == SEP) {
781             if (!test(state, start, subtagLen)) {
782                 return FALSE;
783             }
784             subtagLen = 0;
785             start = p + 1;
786         } else {
787             subtagLen++;
788         }
789     }
790 
791     if (test(state, start, subtagLen) && state >= 0) {
792         return TRUE;
793     }
794     return FALSE;
795 }
796 
797 U_CFUNC UBool
ultag_isTransformedExtensionSubtags(const char * s,int32_t len)798 ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
799 {
800     return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
801 }
802 
803 U_CFUNC UBool
ultag_isUnicodeExtensionSubtags(const char * s,int32_t len)804 ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
805     return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
806 }
807 
808 
809 /*
810 * -------------------------------------------------
811 *
812 * Helper functions
813 *
814 * -------------------------------------------------
815 */
816 
817 static UBool
_addVariantToList(VariantListEntry ** first,VariantListEntry * var)818 _addVariantToList(VariantListEntry **first, VariantListEntry *var) {
819     UBool bAdded = TRUE;
820 
821     if (*first == NULL) {
822         var->next = NULL;
823         *first = var;
824     } else {
825         VariantListEntry *prev, *cur;
826         int32_t cmp;
827 
828         /* variants order should be preserved */
829         prev = NULL;
830         cur = *first;
831         while (TRUE) {
832             if (cur == NULL) {
833                 prev->next = var;
834                 var->next = NULL;
835                 break;
836             }
837 
838             /* Checking for duplicate variant */
839             cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
840             if (cmp == 0) {
841                 /* duplicated variant */
842                 bAdded = FALSE;
843                 break;
844             }
845             prev = cur;
846             cur = cur->next;
847         }
848     }
849 
850     return bAdded;
851 }
852 
853 static UBool
_addAttributeToList(AttributeListEntry ** first,AttributeListEntry * attr)854 _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
855     UBool bAdded = TRUE;
856 
857     if (*first == NULL) {
858         attr->next = NULL;
859         *first = attr;
860     } else {
861         AttributeListEntry *prev, *cur;
862         int32_t cmp;
863 
864         /* reorder variants in alphabetical order */
865         prev = NULL;
866         cur = *first;
867         while (TRUE) {
868             if (cur == NULL) {
869                 prev->next = attr;
870                 attr->next = NULL;
871                 break;
872             }
873             cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
874             if (cmp < 0) {
875                 if (prev == NULL) {
876                     *first = attr;
877                 } else {
878                     prev->next = attr;
879                 }
880                 attr->next = cur;
881                 break;
882             }
883             if (cmp == 0) {
884                 /* duplicated variant */
885                 bAdded = FALSE;
886                 break;
887             }
888             prev = cur;
889             cur = cur->next;
890         }
891     }
892 
893     return bAdded;
894 }
895 
896 
897 static UBool
_addExtensionToList(ExtensionListEntry ** first,ExtensionListEntry * ext,UBool localeToBCP)898 _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
899     UBool bAdded = TRUE;
900 
901     if (*first == NULL) {
902         ext->next = NULL;
903         *first = ext;
904     } else {
905         ExtensionListEntry *prev, *cur;
906         int32_t cmp;
907 
908         /* reorder variants in alphabetical order */
909         prev = NULL;
910         cur = *first;
911         while (TRUE) {
912             if (cur == NULL) {
913                 prev->next = ext;
914                 ext->next = NULL;
915                 break;
916             }
917             if (localeToBCP) {
918                 /* special handling for locale to bcp conversion */
919                 int32_t len, curlen;
920 
921                 len = (int32_t)uprv_strlen(ext->key);
922                 curlen = (int32_t)uprv_strlen(cur->key);
923 
924                 if (len == 1 && curlen == 1) {
925                     if (*(ext->key) == *(cur->key)) {
926                         cmp = 0;
927                     } else if (*(ext->key) == PRIVATEUSE) {
928                         cmp = 1;
929                     } else if (*(cur->key) == PRIVATEUSE) {
930                         cmp = -1;
931                     } else {
932                         cmp = *(ext->key) - *(cur->key);
933                     }
934                 } else if (len == 1) {
935                     cmp = *(ext->key) - LDMLEXT;
936                 } else if (curlen == 1) {
937                     cmp = LDMLEXT - *(cur->key);
938                 } else {
939                     cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
940                     /* Both are u extension keys - we need special handling for 'attribute' */
941                     if (cmp != 0) {
942                         if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
943                             cmp = 1;
944                         } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
945                             cmp = -1;
946                         }
947                     }
948                 }
949             } else {
950                 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
951             }
952             if (cmp < 0) {
953                 if (prev == NULL) {
954                     *first = ext;
955                 } else {
956                     prev->next = ext;
957                 }
958                 ext->next = cur;
959                 break;
960             }
961             if (cmp == 0) {
962                 /* duplicated extension key */
963                 bAdded = FALSE;
964                 break;
965             }
966             prev = cur;
967             cur = cur->next;
968         }
969     }
970 
971     return bAdded;
972 }
973 
974 static void
_initializeULanguageTag(ULanguageTag * langtag)975 _initializeULanguageTag(ULanguageTag* langtag) {
976     int32_t i;
977 
978     langtag->buf = NULL;
979 
980     langtag->language = EMPTY;
981     for (i = 0; i < MAXEXTLANG; i++) {
982         langtag->extlang[i] = NULL;
983     }
984 
985     langtag->script = EMPTY;
986     langtag->region = EMPTY;
987 
988     langtag->variants = NULL;
989     langtag->extensions = NULL;
990 
991     langtag->legacy = EMPTY;
992     langtag->privateuse = EMPTY;
993 }
994 
995 static void
_appendLanguageToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)996 _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
997     char buf[ULOC_LANG_CAPACITY];
998     UErrorCode tmpStatus = U_ZERO_ERROR;
999     int32_t len, i;
1000 
1001     if (U_FAILURE(*status)) {
1002         return;
1003     }
1004 
1005     len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1006     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1007         if (strict) {
1008             *status = U_ILLEGAL_ARGUMENT_ERROR;
1009             return;
1010         }
1011         len = 0;
1012     }
1013 
1014     /* Note: returned language code is in lower case letters */
1015 
1016     if (len == 0) {
1017         sink.Append(LANG_UND, LANG_UND_LEN);
1018     } else if (!ultag_isLanguageSubtag(buf, len)) {
1019             /* invalid language code */
1020         if (strict) {
1021             *status = U_ILLEGAL_ARGUMENT_ERROR;
1022             return;
1023         }
1024         sink.Append(LANG_UND, LANG_UND_LEN);
1025     } else {
1026         /* resolve deprecated */
1027         for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1028             // 2-letter deprecated subtags are listede before 3-letter
1029             // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1030             // across the 1st 3-letter subtag, if the input is a 2-letter code.
1031             // to avoid continuing to try when there's no match.
1032             if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1033             if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1034                 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1035                 len = (int32_t)uprv_strlen(buf);
1036                 break;
1037             }
1038         }
1039         sink.Append(buf, len);
1040     }
1041 }
1042 
1043 static void
_appendScriptToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1044 _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1045     char buf[ULOC_SCRIPT_CAPACITY];
1046     UErrorCode tmpStatus = U_ZERO_ERROR;
1047     int32_t len;
1048 
1049     if (U_FAILURE(*status)) {
1050         return;
1051     }
1052 
1053     len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1054     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1055         if (strict) {
1056             *status = U_ILLEGAL_ARGUMENT_ERROR;
1057         }
1058         return;
1059     }
1060 
1061     if (len > 0) {
1062         if (!ultag_isScriptSubtag(buf, len)) {
1063             /* invalid script code */
1064             if (strict) {
1065                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1066             }
1067             return;
1068         } else {
1069             sink.Append("-", 1);
1070             sink.Append(buf, len);
1071         }
1072     }
1073 }
1074 
1075 static void
_appendRegionToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1076 _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1077     char buf[ULOC_COUNTRY_CAPACITY];
1078     UErrorCode tmpStatus = U_ZERO_ERROR;
1079     int32_t len;
1080 
1081     if (U_FAILURE(*status)) {
1082         return;
1083     }
1084 
1085     len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1086     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1087         if (strict) {
1088             *status = U_ILLEGAL_ARGUMENT_ERROR;
1089         }
1090         return;
1091     }
1092 
1093     if (len > 0) {
1094         if (!ultag_isRegionSubtag(buf, len)) {
1095             /* invalid region code */
1096             if (strict) {
1097                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1098             }
1099             return;
1100         } else {
1101             sink.Append("-", 1);
1102             /* resolve deprecated */
1103             for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1104                 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1105                     uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1106                     len = (int32_t)uprv_strlen(buf);
1107                     break;
1108                 }
1109             }
1110             sink.Append(buf, len);
1111         }
1112     }
1113 }
1114 
_sortVariants(VariantListEntry * first)1115 static void _sortVariants(VariantListEntry* first) {
1116     for (VariantListEntry* var1 = first; var1 != NULL; var1 = var1->next) {
1117         for (VariantListEntry* var2 = var1->next; var2 != NULL; var2 = var2->next) {
1118             // Swap var1->variant and var2->variant.
1119             if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) {
1120                 const char* temp = var1->variant;
1121                 var1->variant = var2->variant;
1122                 var2->variant = temp;
1123             }
1124         }
1125     }
1126 }
1127 
1128 static void
_appendVariantsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool * hadPosix,UErrorCode * status)1129 _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
1130     char buf[ULOC_FULLNAME_CAPACITY];
1131     UErrorCode tmpStatus = U_ZERO_ERROR;
1132     int32_t len, i;
1133 
1134     if (U_FAILURE(*status)) {
1135         return;
1136     }
1137 
1138     len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1139     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1140         if (strict) {
1141             *status = U_ILLEGAL_ARGUMENT_ERROR;
1142         }
1143         return;
1144     }
1145 
1146     if (len > 0) {
1147         char *p, *pVar;
1148         UBool bNext = TRUE;
1149         VariantListEntry *var;
1150         VariantListEntry *varFirst = NULL;
1151 
1152         pVar = NULL;
1153         p = buf;
1154         while (bNext) {
1155             if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1156                 if (*p == 0) {
1157                     bNext = FALSE;
1158                 } else {
1159                     *p = 0; /* terminate */
1160                 }
1161                 if (pVar == NULL) {
1162                     if (strict) {
1163                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1164                         break;
1165                     }
1166                     /* ignore empty variant */
1167                 } else {
1168                     /* ICU uses upper case letters for variants, but
1169                        the canonical format is lowercase in BCP47 */
1170                     for (i = 0; *(pVar + i) != 0; i++) {
1171                         *(pVar + i) = uprv_tolower(*(pVar + i));
1172                     }
1173 
1174                     /* validate */
1175                     if (_isVariantSubtag(pVar, -1)) {
1176                         if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
1177                             /* emit the variant to the list */
1178                             var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
1179                             if (var == NULL) {
1180                                 *status = U_MEMORY_ALLOCATION_ERROR;
1181                                 break;
1182                             }
1183                             var->variant = pVar;
1184                             if (!_addVariantToList(&varFirst, var)) {
1185                                 /* duplicated variant */
1186                                 uprv_free(var);
1187                                 if (strict) {
1188                                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1189                                     break;
1190                                 }
1191                             }
1192                         } else {
1193                             /* Special handling for POSIX variant, need to remember that we had it and then */
1194                             /* treat it like an extension later. */
1195                             *hadPosix = TRUE;
1196                         }
1197                     } else if (strict) {
1198                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1199                         break;
1200                     } else if (_isPrivateuseValueSubtag(pVar, -1)) {
1201                         /* Handle private use subtags separately */
1202                         break;
1203                     }
1204                 }
1205                 /* reset variant starting position */
1206                 pVar = NULL;
1207             } else if (pVar == NULL) {
1208                 pVar = p;
1209             }
1210             p++;
1211         }
1212 
1213         if (U_SUCCESS(*status)) {
1214             if (varFirst != NULL) {
1215                 int32_t varLen;
1216 
1217                 /* per UTS35, we should sort the variants */
1218                 _sortVariants(varFirst);
1219 
1220                 /* write out validated/normalized variants to the target */
1221                 var = varFirst;
1222                 while (var != NULL) {
1223                     sink.Append("-", 1);
1224                     varLen = (int32_t)uprv_strlen(var->variant);
1225                     sink.Append(var->variant, varLen);
1226                     var = var->next;
1227                 }
1228             }
1229         }
1230 
1231         /* clean up */
1232         var = varFirst;
1233         while (var != NULL) {
1234             VariantListEntry *tmpVar = var->next;
1235             uprv_free(var);
1236             var = tmpVar;
1237         }
1238 
1239         if (U_FAILURE(*status)) {
1240             return;
1241         }
1242     }
1243 }
1244 
1245 static void
_appendKeywordsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1246 _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1247     char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1248     int32_t attrBufLength = 0;
1249 
1250     icu::MemoryPool<AttributeListEntry> attrPool;
1251     icu::MemoryPool<ExtensionListEntry> extPool;
1252     icu::MemoryPool<icu::CharString> strPool;
1253 
1254     icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1255     if (U_FAILURE(*status) && !hadPosix) {
1256         return;
1257     }
1258     if (keywordEnum.isValid() || hadPosix) {
1259         /* reorder extensions */
1260         int32_t len;
1261         const char *key;
1262         ExtensionListEntry *firstExt = NULL;
1263         ExtensionListEntry *ext;
1264         AttributeListEntry *firstAttr = NULL;
1265         AttributeListEntry *attr;
1266         icu::MemoryPool<icu::CharString> extBufPool;
1267         const char *bcpKey=nullptr, *bcpValue=nullptr;
1268         UErrorCode tmpStatus = U_ZERO_ERROR;
1269         int32_t keylen;
1270         UBool isBcpUExt;
1271 
1272         while (TRUE) {
1273             key = uenum_next(keywordEnum.getAlias(), NULL, status);
1274             if (key == NULL) {
1275                 break;
1276             }
1277 
1278             icu::CharString buf;
1279             {
1280                 icu::CharStringByteSink sink(&buf);
1281                 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
1282             }
1283             len = buf.length();
1284 
1285             if (U_FAILURE(tmpStatus)) {
1286                 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1287                     *status = U_MEMORY_ALLOCATION_ERROR;
1288                     break;
1289                 }
1290                 if (strict) {
1291                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1292                     break;
1293                 }
1294                 /* ignore this keyword */
1295                 tmpStatus = U_ZERO_ERROR;
1296                 continue;
1297             }
1298 
1299             keylen = (int32_t)uprv_strlen(key);
1300             isBcpUExt = (keylen > 1);
1301 
1302             /* special keyword used for representing Unicode locale attributes */
1303             if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1304                 if (len > 0) {
1305                     int32_t i = 0;
1306                     while (TRUE) {
1307                         attrBufLength = 0;
1308                         for (; i < len; i++) {
1309                             if (buf[i] != '-') {
1310                                 attrBuf[attrBufLength++] = buf[i];
1311                             } else {
1312                                 i++;
1313                                 break;
1314                             }
1315                         }
1316                         if (attrBufLength > 0) {
1317                             attrBuf[attrBufLength] = 0;
1318 
1319                         } else if (i >= len){
1320                             break;
1321                         }
1322 
1323                         /* create AttributeListEntry */
1324                         attr = attrPool.create();
1325                         if (attr == NULL) {
1326                             *status = U_MEMORY_ALLOCATION_ERROR;
1327                             break;
1328                         }
1329                         icu::CharString* attrValue =
1330                                 strPool.create(attrBuf, attrBufLength, *status);
1331                         if (attrValue == NULL) {
1332                             *status = U_MEMORY_ALLOCATION_ERROR;
1333                             break;
1334                         }
1335                         if (U_FAILURE(*status)) {
1336                             break;
1337                         }
1338                         attr->attribute = attrValue->data();
1339 
1340                         if (!_addAttributeToList(&firstAttr, attr)) {
1341                             if (strict) {
1342                                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1343                                 break;
1344                             }
1345                         }
1346                     }
1347                     /* for a place holder ExtensionListEntry */
1348                     bcpKey = LOCALE_ATTRIBUTE_KEY;
1349                     bcpValue = NULL;
1350                 }
1351             } else if (isBcpUExt) {
1352                 bcpKey = uloc_toUnicodeLocaleKey(key);
1353                 if (bcpKey == NULL) {
1354                     if (strict) {
1355                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1356                         break;
1357                     }
1358                     continue;
1359                 }
1360 
1361                 /* we've checked buf is null-terminated above */
1362                 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1363                 if (bcpValue == NULL) {
1364                     if (strict) {
1365                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1366                         break;
1367                     }
1368                     continue;
1369                 }
1370                 if (bcpValue == buf.data()) {
1371                     /*
1372                     When uloc_toUnicodeLocaleType(key, buf) returns the
1373                     input value as is, the value is well-formed, but has
1374                     no known mapping. This implementation normalizes the
1375                     value to lower case
1376                     */
1377                     icu::CharString* extBuf = extBufPool.create(buf, tmpStatus);
1378 
1379                     if (extBuf == nullptr) {
1380                         *status = U_MEMORY_ALLOCATION_ERROR;
1381                         break;
1382                     }
1383                     if (U_FAILURE(tmpStatus)) {
1384                         *status = tmpStatus;
1385                         break;
1386                     }
1387 
1388                     T_CString_toLowerCase(extBuf->data());
1389                     bcpValue = extBuf->data();
1390                 }
1391             } else {
1392                 if (*key == PRIVATEUSE) {
1393                     if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1394                         if (strict) {
1395                             *status = U_ILLEGAL_ARGUMENT_ERROR;
1396                             break;
1397                         }
1398                         continue;
1399                     }
1400                 } else {
1401                     if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1402                         if (strict) {
1403                             *status = U_ILLEGAL_ARGUMENT_ERROR;
1404                             break;
1405                         }
1406                         continue;
1407                     }
1408                 }
1409                 bcpKey = key;
1410                 icu::CharString* extBuf =
1411                     extBufPool.create(buf.data(), len, tmpStatus);
1412                 if (extBuf == nullptr) {
1413                     *status = U_MEMORY_ALLOCATION_ERROR;
1414                     break;
1415                 }
1416                 if (U_FAILURE(tmpStatus)) {
1417                     *status = tmpStatus;
1418                     break;
1419                 }
1420                 bcpValue = extBuf->data();
1421             }
1422 
1423             /* create ExtensionListEntry */
1424             ext = extPool.create();
1425             if (ext == NULL) {
1426                 *status = U_MEMORY_ALLOCATION_ERROR;
1427                 break;
1428             }
1429             ext->key = bcpKey;
1430             ext->value = bcpValue;
1431 
1432             if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1433                 if (strict) {
1434                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1435                     break;
1436                 }
1437             }
1438         }
1439 
1440         /* Special handling for POSIX variant - add the keywords for POSIX */
1441         if (hadPosix) {
1442             /* create ExtensionListEntry for POSIX */
1443             ext = extPool.create();
1444             if (ext == NULL) {
1445                 *status = U_MEMORY_ALLOCATION_ERROR;
1446                 return;
1447             }
1448             ext->key = POSIX_KEY;
1449             ext->value = POSIX_VALUE;
1450 
1451             if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1452                 // Silently ignore errors.
1453             }
1454         }
1455 
1456         if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
1457             UBool startLDMLExtension = FALSE;
1458             for (ext = firstExt; ext; ext = ext->next) {
1459                 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1460                     /* first LDML u singlton extension */
1461                    sink.Append("-u", 2);
1462                    startLDMLExtension = TRUE;
1463                 }
1464 
1465                 /* write out the sorted BCP47 attributes, extensions and private use */
1466                 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1467                     /* write the value for the attributes */
1468                     for (attr = firstAttr; attr; attr = attr->next) {
1469                         sink.Append("-", 1);
1470                         sink.Append(
1471                                 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1472                     }
1473                 } else {
1474                     sink.Append("-", 1);
1475                     sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1476                     if (uprv_strcmp(ext->value, "true") != 0 &&
1477                         uprv_strcmp(ext->value, "yes") != 0) {
1478                       sink.Append("-", 1);
1479                       sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1480                     }
1481                 }
1482             }
1483         }
1484     }
1485 }
1486 
1487 /**
1488  * Append keywords parsed from LDML extension value
1489  * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1490  * Note: char* buf is used for storing keywords
1491  */
1492 static void
_appendLDMLExtensionAsKeywords(const char * ldmlext,ExtensionListEntry ** appendTo,icu::MemoryPool<ExtensionListEntry> & extPool,icu::MemoryPool<icu::CharString> & kwdBuf,UBool * posixVariant,UErrorCode * status)1493 _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
1494     const char *pTag;   /* beginning of current subtag */
1495     const char *pKwds;  /* beginning of key-type pairs */
1496     UBool variantExists = *posixVariant;
1497 
1498     ExtensionListEntry *kwdFirst = NULL;    /* first LDML keyword */
1499     ExtensionListEntry *kwd, *nextKwd;
1500 
1501     int32_t len;
1502 
1503     /* Reset the posixVariant value */
1504     *posixVariant = FALSE;
1505 
1506     pTag = ldmlext;
1507     pKwds = NULL;
1508 
1509     {
1510         AttributeListEntry *attrFirst = NULL;   /* first attribute */
1511         AttributeListEntry *attr, *nextAttr;
1512 
1513         char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1514         int32_t attrBufIdx = 0;
1515 
1516         icu::MemoryPool<AttributeListEntry> attrPool;
1517 
1518         /* Iterate through u extension attributes */
1519         while (*pTag) {
1520             /* locate next separator char */
1521             for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1522 
1523             if (ultag_isUnicodeLocaleKey(pTag, len)) {
1524                 pKwds = pTag;
1525                 break;
1526             }
1527 
1528             /* add this attribute to the list */
1529             attr = attrPool.create();
1530             if (attr == NULL) {
1531                 *status = U_MEMORY_ALLOCATION_ERROR;
1532                 return;
1533             }
1534 
1535             if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
1536                 uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
1537                 attrBuf[attrBufIdx + len] = 0;
1538                 attr->attribute = &attrBuf[attrBufIdx];
1539                 attrBufIdx += (len + 1);
1540             } else {
1541                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1542                 return;
1543             }
1544 
1545             // duplicate attribute is ignored, causes no error.
1546             _addAttributeToList(&attrFirst, attr);
1547 
1548             /* next tag */
1549             pTag += len;
1550             if (*pTag) {
1551                 /* next to the separator */
1552                 pTag++;
1553             }
1554         }
1555 
1556         if (attrFirst) {
1557             /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1558 
1559             kwd = extPool.create();
1560             if (kwd == NULL) {
1561                 *status = U_MEMORY_ALLOCATION_ERROR;
1562                 return;
1563             }
1564 
1565             icu::CharString* value = kwdBuf.create();
1566             if (value == NULL) {
1567                 *status = U_MEMORY_ALLOCATION_ERROR;
1568                 return;
1569             }
1570 
1571             /* attribute subtags sorted in alphabetical order as type */
1572             attr = attrFirst;
1573             while (attr != NULL) {
1574                 nextAttr = attr->next;
1575                 if (attr != attrFirst) {
1576                     value->append('-', *status);
1577                 }
1578                 value->append(attr->attribute, *status);
1579                 attr = nextAttr;
1580             }
1581             if (U_FAILURE(*status)) {
1582                 return;
1583             }
1584 
1585             kwd->key = LOCALE_ATTRIBUTE_KEY;
1586             kwd->value = value->data();
1587 
1588             if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1589                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1590                 return;
1591             }
1592         }
1593     }
1594 
1595     if (pKwds) {
1596         const char *pBcpKey = NULL;     /* u extenstion key subtag */
1597         const char *pBcpType = NULL;    /* beginning of u extension type subtag(s) */
1598         int32_t bcpKeyLen = 0;
1599         int32_t bcpTypeLen = 0;
1600         UBool isDone = FALSE;
1601 
1602         pTag = pKwds;
1603         /* BCP47 representation of LDML key/type pairs */
1604         while (!isDone) {
1605             const char *pNextBcpKey = NULL;
1606             int32_t nextBcpKeyLen = 0;
1607             UBool emitKeyword = FALSE;
1608 
1609             if (*pTag) {
1610                 /* locate next separator char */
1611                 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1612 
1613                 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1614                     if (pBcpKey) {
1615                         emitKeyword = TRUE;
1616                         pNextBcpKey = pTag;
1617                         nextBcpKeyLen = len;
1618                     } else {
1619                         pBcpKey = pTag;
1620                         bcpKeyLen = len;
1621                     }
1622                 } else {
1623                     U_ASSERT(pBcpKey != NULL);
1624                     /* within LDML type subtags */
1625                     if (pBcpType) {
1626                         bcpTypeLen += (len + 1);
1627                     } else {
1628                         pBcpType = pTag;
1629                         bcpTypeLen = len;
1630                     }
1631                 }
1632 
1633                 /* next tag */
1634                 pTag += len;
1635                 if (*pTag) {
1636                     /* next to the separator */
1637                     pTag++;
1638                 }
1639             } else {
1640                 /* processing last one */
1641                 emitKeyword = TRUE;
1642                 isDone = TRUE;
1643             }
1644 
1645             if (emitKeyword) {
1646                 const char *pKey = NULL;    /* LDML key */
1647                 const char *pType = NULL;   /* LDML type */
1648 
1649                 char bcpKeyBuf[3];          /* BCP key length is always 2 for now */
1650 
1651                 U_ASSERT(pBcpKey != NULL);
1652 
1653                 if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
1654                     /* the BCP key is invalid */
1655                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1656                     return;
1657                 }
1658                 U_ASSERT(bcpKeyLen <= 2);
1659 
1660                 uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
1661                 bcpKeyBuf[bcpKeyLen] = 0;
1662 
1663                 /* u extension key to LDML key */
1664                 pKey = uloc_toLegacyKey(bcpKeyBuf);
1665                 if (pKey == NULL) {
1666                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1667                     return;
1668                 }
1669                 if (pKey == bcpKeyBuf) {
1670                     /*
1671                     The key returned by toLegacyKey points to the input buffer.
1672                     We normalize the result key to lower case.
1673                     */
1674                     T_CString_toLowerCase(bcpKeyBuf);
1675                     icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
1676                     if (key == NULL) {
1677                         *status = U_MEMORY_ALLOCATION_ERROR;
1678                         return;
1679                     }
1680                     if (U_FAILURE(*status)) {
1681                         return;
1682                     }
1683                     pKey = key->data();
1684                 }
1685 
1686                 if (pBcpType) {
1687                     char bcpTypeBuf[128];       /* practically long enough even considering multiple subtag type */
1688                     if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
1689                         /* the BCP type is too long */
1690                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1691                         return;
1692                     }
1693 
1694                     uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
1695                     bcpTypeBuf[bcpTypeLen] = 0;
1696 
1697                     /* BCP type to locale type */
1698                     pType = uloc_toLegacyType(pKey, bcpTypeBuf);
1699                     if (pType == NULL) {
1700                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1701                         return;
1702                     }
1703                     if (pType == bcpTypeBuf) {
1704                         /*
1705                         The type returned by toLegacyType points to the input buffer.
1706                         We normalize the result type to lower case.
1707                         */
1708                         /* normalize to lower case */
1709                         T_CString_toLowerCase(bcpTypeBuf);
1710                         icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
1711                         if (type == NULL) {
1712                             *status = U_MEMORY_ALLOCATION_ERROR;
1713                             return;
1714                         }
1715                         if (U_FAILURE(*status)) {
1716                             return;
1717                         }
1718                         pType = type->data();
1719                     }
1720                 } else {
1721                     /* typeless - default type value is "yes" */
1722                     pType = LOCALE_TYPE_YES;
1723                 }
1724 
1725                 /* Special handling for u-va-posix, since we want to treat this as a variant,
1726                    not as a keyword */
1727                 if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
1728                     *posixVariant = TRUE;
1729                 } else {
1730                     /* create an ExtensionListEntry for this keyword */
1731                     kwd = extPool.create();
1732                     if (kwd == NULL) {
1733                         *status = U_MEMORY_ALLOCATION_ERROR;
1734                         return;
1735                     }
1736 
1737                     kwd->key = pKey;
1738                     kwd->value = pType;
1739 
1740                     if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1741                         // duplicate keyword is allowed, Only the first
1742                         // is honored.
1743                     }
1744                 }
1745 
1746                 pBcpKey = pNextBcpKey;
1747                 bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
1748                 pBcpType = NULL;
1749                 bcpTypeLen = 0;
1750             }
1751         }
1752     }
1753 
1754     kwd = kwdFirst;
1755     while (kwd != NULL) {
1756         nextKwd = kwd->next;
1757         _addExtensionToList(appendTo, kwd, FALSE);
1758         kwd = nextKwd;
1759     }
1760 }
1761 
1762 
1763 static void
_appendKeywords(ULanguageTag * langtag,icu::ByteSink & sink,UErrorCode * status)1764 _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1765     int32_t i, n;
1766     int32_t len;
1767     ExtensionListEntry *kwdFirst = NULL;
1768     ExtensionListEntry *kwd;
1769     const char *key, *type;
1770     icu::MemoryPool<ExtensionListEntry> extPool;
1771     icu::MemoryPool<icu::CharString> kwdBuf;
1772     UBool posixVariant = FALSE;
1773 
1774     if (U_FAILURE(*status)) {
1775         return;
1776     }
1777 
1778     /* Determine if variants already exists */
1779     if (ultag_getVariantsSize(langtag)) {
1780         posixVariant = TRUE;
1781     }
1782 
1783     n = ultag_getExtensionsSize(langtag);
1784 
1785     /* resolve locale keywords and reordering keys */
1786     for (i = 0; i < n; i++) {
1787         key = ultag_getExtensionKey(langtag, i);
1788         type = ultag_getExtensionValue(langtag, i);
1789         if (*key == LDMLEXT) {
1790             _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1791             if (U_FAILURE(*status)) {
1792                 break;
1793             }
1794         } else {
1795             kwd = extPool.create();
1796             if (kwd == NULL) {
1797                 *status = U_MEMORY_ALLOCATION_ERROR;
1798                 break;
1799             }
1800             kwd->key = key;
1801             kwd->value = type;
1802             if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1803                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1804                 break;
1805             }
1806         }
1807     }
1808 
1809     if (U_SUCCESS(*status)) {
1810         type = ultag_getPrivateUse(langtag);
1811         if ((int32_t)uprv_strlen(type) > 0) {
1812             /* add private use as a keyword */
1813             kwd = extPool.create();
1814             if (kwd == NULL) {
1815                 *status = U_MEMORY_ALLOCATION_ERROR;
1816             } else {
1817                 kwd->key = PRIVATEUSE_KEY;
1818                 kwd->value = type;
1819                 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1820                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1821                 }
1822             }
1823         }
1824     }
1825 
1826     /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1827 
1828     if (U_SUCCESS(*status) && posixVariant) {
1829         len = (int32_t) uprv_strlen(_POSIX);
1830         sink.Append(_POSIX, len);
1831     }
1832 
1833     if (U_SUCCESS(*status) && kwdFirst != NULL) {
1834         /* write out the sorted keywords */
1835         UBool firstValue = TRUE;
1836         kwd = kwdFirst;
1837         do {
1838             if (firstValue) {
1839                 sink.Append("@", 1);
1840                 firstValue = FALSE;
1841             } else {
1842                 sink.Append(";", 1);
1843             }
1844 
1845             /* key */
1846             len = (int32_t)uprv_strlen(kwd->key);
1847             sink.Append(kwd->key, len);
1848             sink.Append("=", 1);
1849 
1850             /* type */
1851             len = (int32_t)uprv_strlen(kwd->value);
1852             sink.Append(kwd->value, len);
1853 
1854             kwd = kwd->next;
1855         } while (kwd);
1856     }
1857 }
1858 
1859 static void
_appendPrivateuseToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1860 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1861     (void)hadPosix;
1862     char buf[ULOC_FULLNAME_CAPACITY];
1863     char tmpAppend[ULOC_FULLNAME_CAPACITY];
1864     UErrorCode tmpStatus = U_ZERO_ERROR;
1865     int32_t len, i;
1866     int32_t reslen = 0;
1867     int32_t capacity = sizeof tmpAppend;
1868 
1869     if (U_FAILURE(*status)) {
1870         return;
1871     }
1872 
1873     len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1874     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1875         if (strict) {
1876             *status = U_ILLEGAL_ARGUMENT_ERROR;
1877         }
1878         return;
1879     }
1880 
1881     if (len > 0) {
1882         char *p, *pPriv;
1883         UBool bNext = TRUE;
1884         UBool firstValue = TRUE;
1885         UBool writeValue;
1886 
1887         pPriv = NULL;
1888         p = buf;
1889         while (bNext) {
1890             writeValue = FALSE;
1891             if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1892                 if (*p == 0) {
1893                     bNext = FALSE;
1894                 } else {
1895                     *p = 0; /* terminate */
1896                 }
1897                 if (pPriv != NULL) {
1898                     /* Private use in the canonical format is lowercase in BCP47 */
1899                     for (i = 0; *(pPriv + i) != 0; i++) {
1900                         *(pPriv + i) = uprv_tolower(*(pPriv + i));
1901                     }
1902 
1903                     /* validate */
1904                     if (_isPrivateuseValueSubtag(pPriv, -1)) {
1905                         if (firstValue) {
1906                             if (!_isVariantSubtag(pPriv, -1)) {
1907                                 writeValue = TRUE;
1908                             }
1909                         } else {
1910                             writeValue = TRUE;
1911                         }
1912                     } else if (strict) {
1913                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1914                         break;
1915                     } else {
1916                         break;
1917                     }
1918 
1919                     if (writeValue) {
1920                         if (reslen < capacity) {
1921                             tmpAppend[reslen++] = SEP;
1922                         }
1923 
1924                         if (firstValue) {
1925                             if (reslen < capacity) {
1926                                 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1927                             }
1928 
1929                             if (reslen < capacity) {
1930                                 tmpAppend[reslen++] = SEP;
1931                             }
1932 
1933                             len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1934                             if (reslen < capacity) {
1935                                 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1936                             }
1937                             reslen += len;
1938 
1939                             if (reslen < capacity) {
1940                                 tmpAppend[reslen++] = SEP;
1941                             }
1942 
1943                             firstValue = FALSE;
1944                         }
1945 
1946                         len = (int32_t)uprv_strlen(pPriv);
1947                         if (reslen < capacity) {
1948                             uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1949                         }
1950                         reslen += len;
1951                     }
1952                 }
1953                 /* reset private use starting position */
1954                 pPriv = NULL;
1955             } else if (pPriv == NULL) {
1956                 pPriv = p;
1957             }
1958             p++;
1959         }
1960 
1961         if (U_FAILURE(*status)) {
1962             return;
1963         }
1964     }
1965 
1966     if (U_SUCCESS(*status)) {
1967         len = reslen;
1968         sink.Append(tmpAppend, len);
1969     }
1970 }
1971 
1972 /*
1973 * -------------------------------------------------
1974 *
1975 * ultag_ functions
1976 *
1977 * -------------------------------------------------
1978 */
1979 
1980 /* Bit flags used by the parser */
1981 #define LANG 0x0001
1982 #define EXTL 0x0002
1983 #define SCRT 0x0004
1984 #define REGN 0x0008
1985 #define VART 0x0010
1986 #define EXTS 0x0020
1987 #define EXTV 0x0040
1988 #define PRIV 0x0080
1989 
1990 /**
1991  * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function.
1992  * As a work-around, optimization is disabled for this function on VS2015 and VS2017.
1993  * This work-around should be removed once the following versions of Visual Studio are no
1994  * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4.
1995  */
1996 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
1997 #pragma optimize( "", off )
1998 #endif
1999 
2000 static ULanguageTag*
ultag_parse(const char * tag,int32_t tagLen,int32_t * parsedLen,UErrorCode * status)2001 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2002     char *tagBuf;
2003     int16_t next;
2004     char *pSubtag, *pNext, *pLastGoodPosition;
2005     int32_t subtagLen;
2006     int32_t extlangIdx;
2007     ExtensionListEntry *pExtension;
2008     char *pExtValueSubtag, *pExtValueSubtagEnd;
2009     int32_t i;
2010     UBool privateuseVar = FALSE;
2011     int32_t legacyLen = 0;
2012 
2013     if (parsedLen != NULL) {
2014         *parsedLen = 0;
2015     }
2016 
2017     if (U_FAILURE(*status)) {
2018         return NULL;
2019     }
2020 
2021     if (tagLen < 0) {
2022         tagLen = (int32_t)uprv_strlen(tag);
2023     }
2024 
2025     /* copy the entire string */
2026     tagBuf = (char*)uprv_malloc(tagLen + 1);
2027     if (tagBuf == NULL) {
2028         *status = U_MEMORY_ALLOCATION_ERROR;
2029         return NULL;
2030     }
2031     uprv_memcpy(tagBuf, tag, tagLen);
2032     *(tagBuf + tagLen) = 0;
2033 
2034     /* create a ULanguageTag */
2035     icu::LocalULanguageTagPointer t(
2036             (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2037     if (t.isNull()) {
2038         uprv_free(tagBuf);
2039         *status = U_MEMORY_ALLOCATION_ERROR;
2040         return NULL;
2041     }
2042     _initializeULanguageTag(t.getAlias());
2043     t->buf = tagBuf;
2044 
2045     if (tagLen < MINLEN) {
2046         /* the input tag is too short - return empty ULanguageTag */
2047         return t.orphan();
2048     }
2049 
2050     size_t parsedLenDelta = 0;
2051     // Legacy tag will be consider together. Legacy tag with intervening
2052     // script and region such as art-DE-lojban or art-Latn-lojban won't be
2053     // matched.
2054     /* check if the tag is legacy */
2055     for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
2056         int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
2057         if (tagLen < checkLegacyLen) {
2058             continue;
2059         }
2060         if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
2061             // make sure next char is '-'.
2062             continue;
2063         }
2064         if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
2065             int32_t newTagLength;
2066 
2067             legacyLen = checkLegacyLen;  /* back up for output parsedLen */
2068             int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
2069             newTagLength = replacementLen + tagLen - checkLegacyLen;
2070             if (tagLen < newTagLength) {
2071                 uprv_free(tagBuf);
2072                 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2073                 if (tagBuf == NULL) {
2074                     *status = U_MEMORY_ALLOCATION_ERROR;
2075                     return NULL;
2076                 }
2077                 t->buf = tagBuf;
2078                 tagLen = newTagLength;
2079             }
2080             parsedLenDelta = checkLegacyLen - replacementLen;
2081             uprv_strcpy(t->buf, LEGACY[i + 1]);
2082             if (checkLegacyLen != tagLen) {
2083                 uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
2084             }
2085             break;
2086         }
2087     }
2088 
2089     if (legacyLen == 0) {
2090         for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2091             const char* redundantTag = REDUNDANT[i];
2092             size_t redundantTagLen = uprv_strlen(redundantTag);
2093             // The preferred tag for a redundant tag is always shorter than redundant
2094             // tag. A redundant tag may or may not be followed by other subtags.
2095             // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2096             if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2097                 const char* redundantTagEnd = tagBuf + redundantTagLen;
2098                 if (*redundantTagEnd  == '\0' || *redundantTagEnd == SEP) {
2099                     const char* preferredTag = REDUNDANT[i + 1];
2100                     size_t preferredTagLen = uprv_strlen(preferredTag);
2101                     uprv_strncpy(t->buf, preferredTag, preferredTagLen);
2102                     if (*redundantTagEnd == SEP) {
2103                         uprv_memmove(tagBuf + preferredTagLen,
2104                                      redundantTagEnd,
2105                                      tagLen - redundantTagLen + 1);
2106                     } else {
2107                         tagBuf[preferredTagLen] = '\0';
2108                     }
2109                     // parsedLen should be the length of the input
2110                     // before redundantTag is replaced by preferredTag.
2111                     // Save the delta to add it back later.
2112                     parsedLenDelta = redundantTagLen - preferredTagLen;
2113                     break;
2114                 }
2115             }
2116         }
2117     }
2118 
2119     /*
2120      * langtag      =   language
2121      *                  ["-" script]
2122      *                  ["-" region]
2123      *                  *("-" variant)
2124      *                  *("-" extension)
2125      *                  ["-" privateuse]
2126      */
2127 
2128     next = LANG | PRIV;
2129     pNext = pLastGoodPosition = tagBuf;
2130     extlangIdx = 0;
2131     pExtension = NULL;
2132     pExtValueSubtag = NULL;
2133     pExtValueSubtagEnd = NULL;
2134 
2135     while (pNext) {
2136         char *pSep;
2137 
2138         pSubtag = pNext;
2139 
2140         /* locate next separator char */
2141         pSep = pSubtag;
2142         while (*pSep) {
2143             if (*pSep == SEP) {
2144                 break;
2145             }
2146             pSep++;
2147         }
2148         if (*pSep == 0) {
2149             /* last subtag */
2150             pNext = NULL;
2151         } else {
2152             pNext = pSep + 1;
2153         }
2154         subtagLen = (int32_t)(pSep - pSubtag);
2155 
2156         if (next & LANG) {
2157             if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2158                 *pSep = 0;  /* terminate */
2159                 // TODO: move deprecated language code handling here.
2160                 t->language = T_CString_toLowerCase(pSubtag);
2161 
2162                 pLastGoodPosition = pSep;
2163                 next = SCRT | REGN | VART | EXTS | PRIV;
2164                 if (subtagLen <= 3)
2165                   next |= EXTL;
2166                 continue;
2167             }
2168         }
2169         if (next & EXTL) {
2170             if (_isExtlangSubtag(pSubtag, subtagLen)) {
2171                 *pSep = 0;
2172                 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2173 
2174                 pLastGoodPosition = pSep;
2175                 if (extlangIdx < 3) {
2176                     next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2177                 } else {
2178                     next = SCRT | REGN | VART | EXTS | PRIV;
2179                 }
2180                 continue;
2181             }
2182         }
2183         if (next & SCRT) {
2184             if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2185                 char *p = pSubtag;
2186 
2187                 *pSep = 0;
2188 
2189                 /* to title case */
2190                 *p = uprv_toupper(*p);
2191                 p++;
2192                 for (; *p; p++) {
2193                     *p = uprv_tolower(*p);
2194                 }
2195 
2196                 t->script = pSubtag;
2197 
2198                 pLastGoodPosition = pSep;
2199                 next = REGN | VART | EXTS | PRIV;
2200                 continue;
2201             }
2202         }
2203         if (next & REGN) {
2204             if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2205                 *pSep = 0;
2206                 // TODO: move deprecated region code handling here.
2207                 t->region = T_CString_toUpperCase(pSubtag);
2208 
2209                 pLastGoodPosition = pSep;
2210                 next = VART | EXTS | PRIV;
2211                 continue;
2212             }
2213         }
2214         if (next & VART) {
2215             if (_isVariantSubtag(pSubtag, subtagLen) ||
2216                (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2217                 VariantListEntry *var;
2218                 UBool isAdded;
2219 
2220                 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2221                 if (var == NULL) {
2222                     *status = U_MEMORY_ALLOCATION_ERROR;
2223                     return NULL;
2224                 }
2225                 *pSep = 0;
2226                 var->variant = T_CString_toUpperCase(pSubtag);
2227                 isAdded = _addVariantToList(&(t->variants), var);
2228                 if (!isAdded) {
2229                     /* duplicated variant entry */
2230                     uprv_free(var);
2231                     break;
2232                 }
2233                 pLastGoodPosition = pSep;
2234                 next = VART | EXTS | PRIV;
2235                 continue;
2236             }
2237         }
2238         if (next & EXTS) {
2239             if (_isExtensionSingleton(pSubtag, subtagLen)) {
2240                 if (pExtension != NULL) {
2241                     if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2242                         /* the previous extension is incomplete */
2243                         uprv_free(pExtension);
2244                         pExtension = NULL;
2245                         break;
2246                     }
2247 
2248                     /* terminate the previous extension value */
2249                     *pExtValueSubtagEnd = 0;
2250                     pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2251 
2252                     /* insert the extension to the list */
2253                     if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2254                         pLastGoodPosition = pExtValueSubtagEnd;
2255                     } else {
2256                         /* stop parsing here */
2257                         uprv_free(pExtension);
2258                         pExtension = NULL;
2259                         break;
2260                     }
2261                 }
2262 
2263                 /* create a new extension */
2264                 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2265                 if (pExtension == NULL) {
2266                     *status = U_MEMORY_ALLOCATION_ERROR;
2267                     return NULL;
2268                 }
2269                 *pSep = 0;
2270                 pExtension->key = T_CString_toLowerCase(pSubtag);
2271                 pExtension->value = NULL;   /* will be set later */
2272 
2273                 /*
2274                  * reset the start and the end location of extension value
2275                  * subtags for this extension
2276                  */
2277                 pExtValueSubtag = NULL;
2278                 pExtValueSubtagEnd = NULL;
2279 
2280                 next = EXTV;
2281                 continue;
2282             }
2283         }
2284         if (next & EXTV) {
2285             if (_isExtensionSubtag(pSubtag, subtagLen)) {
2286                 if (pExtValueSubtag == NULL) {
2287                     /* if the start postion of this extension's value is not yet,
2288                         this one is the first value subtag */
2289                     pExtValueSubtag = pSubtag;
2290                 }
2291 
2292                 /* Mark the end of this subtag */
2293                 pExtValueSubtagEnd = pSep;
2294                 next = EXTS | EXTV | PRIV;
2295 
2296                 continue;
2297             }
2298         }
2299         if (next & PRIV) {
2300             if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2301                 char *pPrivuseVal;
2302 
2303                 if (pExtension != NULL) {
2304                     /* Process the last extension */
2305                     if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2306                         /* the previous extension is incomplete */
2307                         uprv_free(pExtension);
2308                         pExtension = NULL;
2309                         break;
2310                     } else {
2311                         /* terminate the previous extension value */
2312                         *pExtValueSubtagEnd = 0;
2313                         pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2314 
2315                         /* insert the extension to the list */
2316                         if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2317                             pLastGoodPosition = pExtValueSubtagEnd;
2318                             pExtension = NULL;
2319                         } else {
2320                         /* stop parsing here */
2321                             uprv_free(pExtension);
2322                             pExtension = NULL;
2323                             break;
2324                         }
2325                     }
2326                 }
2327 
2328                 /* The rest of part will be private use value subtags */
2329                 if (pNext == NULL) {
2330                     /* empty private use subtag */
2331                     break;
2332                 }
2333                 /* back up the private use value start position */
2334                 pPrivuseVal = pNext;
2335 
2336                 /* validate private use value subtags */
2337                 while (pNext) {
2338                     pSubtag = pNext;
2339                     pSep = pSubtag;
2340                     while (*pSep) {
2341                         if (*pSep == SEP) {
2342                             break;
2343                         }
2344                         pSep++;
2345                     }
2346                     if (*pSep == 0) {
2347                         /* last subtag */
2348                         pNext = NULL;
2349                     } else {
2350                         pNext = pSep + 1;
2351                     }
2352                     subtagLen = (int32_t)(pSep - pSubtag);
2353 
2354                     if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2355                         *pSep = 0;
2356                         next = VART;
2357                         privateuseVar = TRUE;
2358                         break;
2359                     } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2360                         pLastGoodPosition = pSep;
2361                     } else {
2362                         break;
2363                     }
2364                 }
2365 
2366                 if (next == VART) {
2367                     continue;
2368                 }
2369 
2370                 if (pLastGoodPosition - pPrivuseVal > 0) {
2371                     *pLastGoodPosition = 0;
2372                     t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2373                 }
2374                 /* No more subtags, exiting the parse loop */
2375                 break;
2376             }
2377             break;
2378         }
2379 
2380         /* If we fell through here, it means this subtag is illegal - quit parsing */
2381         break;
2382     }
2383 
2384     if (pExtension != NULL) {
2385         /* Process the last extension */
2386         if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2387             /* the previous extension is incomplete */
2388             uprv_free(pExtension);
2389         } else {
2390             /* terminate the previous extension value */
2391             *pExtValueSubtagEnd = 0;
2392             pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2393             /* insert the extension to the list */
2394             if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2395                 pLastGoodPosition = pExtValueSubtagEnd;
2396             } else {
2397                 uprv_free(pExtension);
2398             }
2399         }
2400     }
2401 
2402     if (parsedLen != NULL) {
2403         *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2404     }
2405 
2406     return t.orphan();
2407 }
2408 
2409 // Ticket #12705 - Turn optimization back on.
2410 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2411 #pragma optimize( "", on )
2412 #endif
2413 
2414 static void
ultag_close(ULanguageTag * langtag)2415 ultag_close(ULanguageTag* langtag) {
2416 
2417     if (langtag == NULL) {
2418         return;
2419     }
2420 
2421     uprv_free(langtag->buf);
2422 
2423     if (langtag->variants) {
2424         VariantListEntry *curVar = langtag->variants;
2425         while (curVar) {
2426             VariantListEntry *nextVar = curVar->next;
2427             uprv_free(curVar);
2428             curVar = nextVar;
2429         }
2430     }
2431 
2432     if (langtag->extensions) {
2433         ExtensionListEntry *curExt = langtag->extensions;
2434         while (curExt) {
2435             ExtensionListEntry *nextExt = curExt->next;
2436             uprv_free(curExt);
2437             curExt = nextExt;
2438         }
2439     }
2440 
2441     uprv_free(langtag);
2442 }
2443 
2444 static const char*
ultag_getLanguage(const ULanguageTag * langtag)2445 ultag_getLanguage(const ULanguageTag* langtag) {
2446     return langtag->language;
2447 }
2448 
#if 0
/*
 * Compiled out. Looks the language subtag up in DEPRECATEDLANGS, which is
 * walked as pairs terminated by NULL, and returns the second element of the
 * matching pair; without a match the language subtag itself is returned.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    for (int32_t pair = 0; DEPRECATEDLANGS[pair] != NULL; pair += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[pair], langtag->language) == 0) {
            return DEPRECATEDLANGS[pair + 1];
        }
    }
    return langtag->language;
}
#endif
2461 
2462 static const char*
ultag_getExtlang(const ULanguageTag * langtag,int32_t idx)2463 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2464     if (idx >= 0 && idx < MAXEXTLANG) {
2465         return langtag->extlang[idx];
2466     }
2467     return NULL;
2468 }
2469 
2470 static int32_t
ultag_getExtlangSize(const ULanguageTag * langtag)2471 ultag_getExtlangSize(const ULanguageTag* langtag) {
2472     int32_t size = 0;
2473     int32_t i;
2474     for (i = 0; i < MAXEXTLANG; i++) {
2475         if (langtag->extlang[i]) {
2476             size++;
2477         }
2478     }
2479     return size;
2480 }
2481 
2482 static const char*
ultag_getScript(const ULanguageTag * langtag)2483 ultag_getScript(const ULanguageTag* langtag) {
2484     return langtag->script;
2485 }
2486 
2487 static const char*
ultag_getRegion(const ULanguageTag * langtag)2488 ultag_getRegion(const ULanguageTag* langtag) {
2489     return langtag->region;
2490 }
2491 
2492 static const char*
ultag_getVariant(const ULanguageTag * langtag,int32_t idx)2493 ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2494     const char *var = NULL;
2495     VariantListEntry *cur = langtag->variants;
2496     int32_t i = 0;
2497     while (cur) {
2498         if (i == idx) {
2499             var = cur->variant;
2500             break;
2501         }
2502         cur = cur->next;
2503         i++;
2504     }
2505     return var;
2506 }
2507 
2508 static int32_t
ultag_getVariantsSize(const ULanguageTag * langtag)2509 ultag_getVariantsSize(const ULanguageTag* langtag) {
2510     int32_t size = 0;
2511     VariantListEntry *cur = langtag->variants;
2512     while (TRUE) {
2513         if (cur == NULL) {
2514             break;
2515         }
2516         size++;
2517         cur = cur->next;
2518     }
2519     return size;
2520 }
2521 
2522 static const char*
ultag_getExtensionKey(const ULanguageTag * langtag,int32_t idx)2523 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2524     const char *key = NULL;
2525     ExtensionListEntry *cur = langtag->extensions;
2526     int32_t i = 0;
2527     while (cur) {
2528         if (i == idx) {
2529             key = cur->key;
2530             break;
2531         }
2532         cur = cur->next;
2533         i++;
2534     }
2535     return key;
2536 }
2537 
2538 static const char*
ultag_getExtensionValue(const ULanguageTag * langtag,int32_t idx)2539 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2540     const char *val = NULL;
2541     ExtensionListEntry *cur = langtag->extensions;
2542     int32_t i = 0;
2543     while (cur) {
2544         if (i == idx) {
2545             val = cur->value;
2546             break;
2547         }
2548         cur = cur->next;
2549         i++;
2550     }
2551     return val;
2552 }
2553 
2554 static int32_t
ultag_getExtensionsSize(const ULanguageTag * langtag)2555 ultag_getExtensionsSize(const ULanguageTag* langtag) {
2556     int32_t size = 0;
2557     ExtensionListEntry *cur = langtag->extensions;
2558     while (TRUE) {
2559         if (cur == NULL) {
2560             break;
2561         }
2562         size++;
2563         cur = cur->next;
2564     }
2565     return size;
2566 }
2567 
2568 static const char*
ultag_getPrivateUse(const ULanguageTag * langtag)2569 ultag_getPrivateUse(const ULanguageTag* langtag) {
2570     return langtag->privateuse;
2571 }
2572 
#if 0
/* Compiled out. Returns the `legacy` field stored in `langtag`. */
static const char*
ultag_getLegacy(const ULanguageTag* langtag)
{
    return langtag->legacy;
}
#endif
2579 
2580 
2581 /*
2582 * -------------------------------------------------
2583 *
2584 * Locale/BCP47 conversion APIs, exposed as uloc_*
2585 *
2586 * -------------------------------------------------
2587 */
2588 U_CAPI int32_t U_EXPORT2
uloc_toLanguageTag(const char * localeID,char * langtag,int32_t langtagCapacity,UBool strict,UErrorCode * status)2589 uloc_toLanguageTag(const char* localeID,
2590                    char* langtag,
2591                    int32_t langtagCapacity,
2592                    UBool strict,
2593                    UErrorCode* status) {
2594     if (U_FAILURE(*status)) {
2595         return 0;
2596     }
2597 
2598     icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2599     ulocimp_toLanguageTag(localeID, sink, strict, status);
2600 
2601     int32_t reslen = sink.NumberOfBytesAppended();
2602 
2603     if (U_FAILURE(*status)) {
2604         return reslen;
2605     }
2606 
2607     if (sink.Overflowed()) {
2608         *status = U_BUFFER_OVERFLOW_ERROR;
2609     } else {
2610         u_terminateChars(langtag, langtagCapacity, reslen, status);
2611     }
2612 
2613     return reslen;
2614 }
2615 
2616 
2617 U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)2618 ulocimp_toLanguageTag(const char* localeID,
2619                       icu::ByteSink& sink,
2620                       UBool strict,
2621                       UErrorCode* status) {
2622     icu::CharString canonical;
2623     int32_t reslen;
2624     UErrorCode tmpStatus = U_ZERO_ERROR;
2625     UBool hadPosix = FALSE;
2626     const char* pKeywordStart;
2627 
2628     /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
2629     int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
2630     if (resultCapacity > 0) {
2631         char* buffer;
2632 
2633         for (;;) {
2634             buffer = canonical.getAppendBuffer(
2635                     /*minCapacity=*/resultCapacity,
2636                     /*desiredCapacityHint=*/resultCapacity,
2637                     resultCapacity,
2638                     tmpStatus);
2639 
2640             if (U_FAILURE(tmpStatus)) {
2641                 *status = tmpStatus;
2642                 return;
2643             }
2644 
2645             reslen =
2646                 uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
2647 
2648             if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
2649                 break;
2650             }
2651 
2652             resultCapacity = reslen;
2653             tmpStatus = U_ZERO_ERROR;
2654         }
2655 
2656         if (U_FAILURE(tmpStatus)) {
2657             *status = U_ILLEGAL_ARGUMENT_ERROR;
2658             return;
2659         }
2660 
2661         canonical.append(buffer, reslen, tmpStatus);
2662         if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
2663             tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
2664         }
2665 
2666         if (U_FAILURE(tmpStatus)) {
2667             *status = tmpStatus;
2668             return;
2669         }
2670     }
2671 
2672     /* For handling special case - private use only tag */
2673     pKeywordStart = locale_getKeywordsStart(canonical.data());
2674     if (pKeywordStart == canonical.data()) {
2675         int kwdCnt = 0;
2676         UBool done = FALSE;
2677 
2678         icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
2679         if (U_SUCCESS(tmpStatus)) {
2680             kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
2681             if (kwdCnt == 1) {
2682                 const char *key;
2683                 int32_t len = 0;
2684 
2685                 key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
2686                 if (len == 1 && *key == PRIVATEUSE) {
2687                     icu::CharString buf;
2688                     {
2689                         icu::CharStringByteSink sink(&buf);
2690                         ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
2691                     }
2692                     if (U_SUCCESS(tmpStatus)) {
2693                         if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
2694                             /* return private use only tag */
2695                             static const char PREFIX[] = { PRIVATEUSE, SEP };
2696                             sink.Append(PREFIX, sizeof(PREFIX));
2697                             sink.Append(buf.data(), buf.length());
2698                             done = TRUE;
2699                         } else if (strict) {
2700                             *status = U_ILLEGAL_ARGUMENT_ERROR;
2701                             done = TRUE;
2702                         }
2703                         /* if not strict mode, then "und" will be returned */
2704                     } else {
2705                         *status = U_ILLEGAL_ARGUMENT_ERROR;
2706                         done = TRUE;
2707                     }
2708                 }
2709             }
2710             if (done) {
2711                 return;
2712             }
2713         }
2714     }
2715 
2716     _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
2717     _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
2718     _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
2719     _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
2720     _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2721     _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2722 }
2723 
2724 
2725 U_CAPI int32_t U_EXPORT2
uloc_forLanguageTag(const char * langtag,char * localeID,int32_t localeIDCapacity,int32_t * parsedLength,UErrorCode * status)2726 uloc_forLanguageTag(const char* langtag,
2727                     char* localeID,
2728                     int32_t localeIDCapacity,
2729                     int32_t* parsedLength,
2730                     UErrorCode* status) {
2731     if (U_FAILURE(*status)) {
2732         return 0;
2733     }
2734 
2735     icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2736     ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2737 
2738     int32_t reslen = sink.NumberOfBytesAppended();
2739 
2740     if (U_FAILURE(*status)) {
2741         return reslen;
2742     }
2743 
2744     if (sink.Overflowed()) {
2745         *status = U_BUFFER_OVERFLOW_ERROR;
2746     } else {
2747         u_terminateChars(localeID, localeIDCapacity, reslen, status);
2748     }
2749 
2750     return reslen;
2751 }
2752 
2753 
2754 U_CAPI void U_EXPORT2
ulocimp_forLanguageTag(const char * langtag,int32_t tagLen,icu::ByteSink & sink,int32_t * parsedLength,UErrorCode * status)2755 ulocimp_forLanguageTag(const char* langtag,
2756                        int32_t tagLen,
2757                        icu::ByteSink& sink,
2758                        int32_t* parsedLength,
2759                        UErrorCode* status) {
2760     UBool isEmpty = TRUE;
2761     const char *subtag, *p;
2762     int32_t len;
2763     int32_t i, n;
2764     UBool noRegion = TRUE;
2765 
2766     icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2767     if (U_FAILURE(*status)) {
2768         return;
2769     }
2770 
2771     /* language */
2772     subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2773     if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2774         len = (int32_t)uprv_strlen(subtag);
2775         if (len > 0) {
2776             sink.Append(subtag, len);
2777             isEmpty = FALSE;
2778         }
2779     }
2780 
2781     /* script */
2782     subtag = ultag_getScript(lt.getAlias());
2783     len = (int32_t)uprv_strlen(subtag);
2784     if (len > 0) {
2785         sink.Append("_", 1);
2786         isEmpty = FALSE;
2787 
2788         /* write out the script in title case */
2789         char c = uprv_toupper(*subtag);
2790         sink.Append(&c, 1);
2791         sink.Append(subtag + 1, len - 1);
2792     }
2793 
2794     /* region */
2795     subtag = ultag_getRegion(lt.getAlias());
2796     len = (int32_t)uprv_strlen(subtag);
2797     if (len > 0) {
2798         sink.Append("_", 1);
2799         isEmpty = FALSE;
2800 
2801         /* write out the region in upper case */
2802         p = subtag;
2803         while (*p) {
2804             char c = uprv_toupper(*p);
2805             sink.Append(&c, 1);
2806             p++;
2807         }
2808         noRegion = FALSE;
2809     }
2810 
2811     /* variants */
2812     _sortVariants(lt.getAlias()->variants);
2813     n = ultag_getVariantsSize(lt.getAlias());
2814     if (n > 0) {
2815         if (noRegion) {
2816             sink.Append("_", 1);
2817             isEmpty = FALSE;
2818         }
2819 
2820         for (i = 0; i < n; i++) {
2821             subtag = ultag_getVariant(lt.getAlias(), i);
2822             sink.Append("_", 1);
2823 
2824             /* write out the variant in upper case */
2825             p = subtag;
2826             while (*p) {
2827                 char c = uprv_toupper(*p);
2828                 sink.Append(&c, 1);
2829                 p++;
2830             }
2831         }
2832     }
2833 
2834     /* keywords */
2835     n = ultag_getExtensionsSize(lt.getAlias());
2836     subtag = ultag_getPrivateUse(lt.getAlias());
2837     if (n > 0 || uprv_strlen(subtag) > 0) {
2838         if (isEmpty && n > 0) {
2839             /* need a language */
2840             sink.Append(LANG_UND, LANG_UND_LEN);
2841         }
2842         _appendKeywords(lt.getAlias(), sink, status);
2843     }
2844 }
2845