• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1997-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  loclikely.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2010feb25
16 *   created by: Markus W. Scherer
17 *
18 *   Code for likely and minimized locale subtags, separated out from other .cpp files
19 *   that then do not depend on resource bundle code and likely-subtags data.
20 */
21 
22 #include <string_view>
23 #include <utility>
24 
25 #include "unicode/bytestream.h"
26 #include "unicode/utypes.h"
27 #include "unicode/locid.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/uloc.h"
31 #include "unicode/ures.h"
32 #include "unicode/uscript.h"
33 #include "bytesinkutil.h"
34 #include "charstr.h"
35 #include "cmemory.h"
36 #include "cstring.h"
37 #include "loclikelysubtags.h"
38 #include "ulocimp.h"
39 
40 namespace {
41 
42 /**
43  * Create a tag string from the supplied parameters.  The lang, script and region
44  * parameters may be nullptr pointers. If they are, their corresponding length parameters
45  * must be less than or equal to 0.
46  *
47  * If an illegal argument is provided, the function returns the error
48  * U_ILLEGAL_ARGUMENT_ERROR.
49  *
50  * @param lang The language tag to use.
51  * @param langLength The length of the language tag.
52  * @param script The script tag to use.
53  * @param scriptLength The length of the script tag.
54  * @param region The region tag to use.
55  * @param regionLength The length of the region tag.
56  * @param variant The region tag to use.
57  * @param variantLength The length of the region tag.
58  * @param trailing Any trailing data to append to the new tag.
59  * @param trailingLength The length of the trailing data.
60  * @param sink The output sink receiving the tag string.
61  * @param err A pointer to a UErrorCode for error reporting.
62  **/
63 void U_CALLCONV
createTagStringWithAlternates(const char * lang,int32_t langLength,const char * script,int32_t scriptLength,const char * region,int32_t regionLength,const char * variant,int32_t variantLength,const char * trailing,int32_t trailingLength,icu::ByteSink & sink,UErrorCode & err)64 createTagStringWithAlternates(
65     const char* lang,
66     int32_t langLength,
67     const char* script,
68     int32_t scriptLength,
69     const char* region,
70     int32_t regionLength,
71     const char* variant,
72     int32_t variantLength,
73     const char* trailing,
74     int32_t trailingLength,
75     icu::ByteSink& sink,
76     UErrorCode& err) {
77     if (U_FAILURE(err)) {
78         return;
79     }
80 
81     if (langLength >= ULOC_LANG_CAPACITY ||
82             scriptLength >= ULOC_SCRIPT_CAPACITY ||
83             regionLength >= ULOC_COUNTRY_CAPACITY) {
84         err = U_ILLEGAL_ARGUMENT_ERROR;
85         return;
86     }
87 
88     if (langLength > 0) {
89         sink.Append(lang, langLength);
90     }
91 
92     if (scriptLength > 0) {
93         sink.Append("_", 1);
94         sink.Append(script, scriptLength);
95     }
96 
97     if (regionLength > 0) {
98         sink.Append("_", 1);
99         sink.Append(region, regionLength);
100     }
101 
102     if (variantLength > 0) {
103         if (regionLength == 0) {
104             /* extra separator is required */
105             sink.Append("_", 1);
106         }
107         sink.Append("_", 1);
108         sink.Append(variant, variantLength);
109     }
110 
111     if (trailingLength > 0) {
112         /*
113          * Copy the trailing data into the supplied buffer.
114          */
115         sink.Append(trailing, trailingLength);
116     }
117 }
118 
CHECK_TRAILING_VARIANT_SIZE(const char * variant,int32_t variantLength)119 bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
120     int32_t count = 0;
121     for (int32_t i = 0; i < variantLength; i++) {
122         if (_isIDSeparator(variant[i])) {
123             count = 0;
124         } else if (count == 8) {
125             return false;
126         } else {
127             count++;
128         }
129     }
130     return true;
131 }
132 
133 void
_uloc_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & err)134 _uloc_addLikelySubtags(const char* localeID,
135                        icu::ByteSink& sink,
136                        UErrorCode& err) {
137     if (U_FAILURE(err)) {
138         return;
139     }
140 
141     if (localeID == nullptr) {
142         err = U_ILLEGAL_ARGUMENT_ERROR;
143         return;
144     }
145 
146     icu::CharString lang;
147     icu::CharString script;
148     icu::CharString region;
149     icu::CharString variant;
150     const char* trailing = nullptr;
151     ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
152     if (U_FAILURE(err)) {
153         return;
154     }
155 
156     if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
157         err = U_ILLEGAL_ARGUMENT_ERROR;
158         return;
159     }
160 
161     if (lang.length() == 4) {
162         if (script.isEmpty()) {
163             script = std::move(lang);
164             lang.clear();
165         } else {
166             err = U_ILLEGAL_ARGUMENT_ERROR;
167             return;
168         }
169     } else if (lang.length() > 8) {
170         err = U_ILLEGAL_ARGUMENT_ERROR;
171         return;
172     }
173 
174     int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
175 
176     const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
177     if (U_FAILURE(err)) {
178         return;
179     }
180     // We need to keep l on the stack because lsr may point into internal
181     // memory of l.
182     icu::Locale l = icu::Locale::createFromName(localeID);
183     if (l.isBogus()) {
184         err = U_ILLEGAL_ARGUMENT_ERROR;
185         return;
186     }
187     icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
188     if (U_FAILURE(err)) {
189         return;
190     }
191     const char* language = lsr.language;
192     if (uprv_strcmp(language, "und") == 0) {
193         language = "";
194     }
195     createTagStringWithAlternates(
196         language,
197         static_cast<int32_t>(uprv_strlen(language)),
198         lsr.script,
199         static_cast<int32_t>(uprv_strlen(lsr.script)),
200         lsr.region,
201         static_cast<int32_t>(uprv_strlen(lsr.region)),
202         variant.data(),
203         variant.length(),
204         trailing,
205         trailingLength,
206         sink,
207         err);
208 }
209 
210 void
_uloc_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & err)211 _uloc_minimizeSubtags(const char* localeID,
212                       icu::ByteSink& sink,
213                       bool favorScript,
214                       UErrorCode& err) {
215     if (U_FAILURE(err)) {
216         return;
217     }
218 
219     if (localeID == nullptr) {
220         err = U_ILLEGAL_ARGUMENT_ERROR;
221         return;
222     }
223 
224     icu::CharString lang;
225     icu::CharString script;
226     icu::CharString region;
227     icu::CharString variant;
228     const char* trailing = nullptr;
229     ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
230     if (U_FAILURE(err)) {
231         return;
232     }
233 
234     if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
235         err = U_ILLEGAL_ARGUMENT_ERROR;
236         return;
237     }
238 
239     int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
240 
241     const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
242     if (U_FAILURE(err)) {
243         return;
244     }
245     icu::LSR lsr = likelySubtags->minimizeSubtags(
246         lang.toStringPiece(),
247         script.toStringPiece(),
248         region.toStringPiece(),
249         favorScript,
250         err);
251     if (U_FAILURE(err)) {
252         return;
253     }
254     const char* language = lsr.language;
255     if (uprv_strcmp(language, "und") == 0) {
256         language = "";
257     }
258     createTagStringWithAlternates(
259         language,
260         static_cast<int32_t>(uprv_strlen(language)),
261         lsr.script,
262         static_cast<int32_t>(uprv_strlen(lsr.script)),
263         lsr.region,
264         static_cast<int32_t>(uprv_strlen(lsr.region)),
265         variant.data(),
266         variant.length(),
267         trailing,
268         trailingLength,
269         sink,
270         err);
271 }
272 
273 }  // namespace
274 
275 U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char * localeID,char * maximizedLocaleID,int32_t maximizedLocaleIDCapacity,UErrorCode * status)276 uloc_addLikelySubtags(const char* localeID,
277                       char* maximizedLocaleID,
278                       int32_t maximizedLocaleIDCapacity,
279                       UErrorCode* status) {
280     return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
281         maximizedLocaleID, maximizedLocaleIDCapacity,
282         [&](icu::ByteSink& sink, UErrorCode& status) {
283             ulocimp_addLikelySubtags(localeID, sink, status);
284         },
285         *status);
286 }
287 
288 U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char * localeID,UErrorCode & status)289 ulocimp_addLikelySubtags(const char* localeID,
290                          UErrorCode& status) {
291     return icu::ByteSinkUtil::viaByteSinkToCharString(
292         [&](icu::ByteSink& sink, UErrorCode& status) {
293             ulocimp_addLikelySubtags(localeID, sink, status);
294         },
295         status);
296 }
297 
298 U_EXPORT void
ulocimp_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & status)299 ulocimp_addLikelySubtags(const char* localeID,
300                          icu::ByteSink& sink,
301                          UErrorCode& status) {
302     if (U_FAILURE(status)) { return; }
303     icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
304     _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
305 }
306 
307 U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char * localeID,char * minimizedLocaleID,int32_t minimizedLocaleIDCapacity,UErrorCode * status)308 uloc_minimizeSubtags(const char* localeID,
309                      char* minimizedLocaleID,
310                      int32_t minimizedLocaleIDCapacity,
311                      UErrorCode* status) {
312     return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
313         minimizedLocaleID, minimizedLocaleIDCapacity,
314         [&](icu::ByteSink& sink, UErrorCode& status) {
315             ulocimp_minimizeSubtags(localeID, sink, false, status);
316         },
317         *status);
318 }
319 
320 U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char * localeID,bool favorScript,UErrorCode & status)321 ulocimp_minimizeSubtags(const char* localeID,
322                         bool favorScript,
323                         UErrorCode& status) {
324     return icu::ByteSinkUtil::viaByteSinkToCharString(
325         [&](icu::ByteSink& sink, UErrorCode& status) {
326             ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
327         },
328         status);
329 }
330 
331 U_EXPORT void
ulocimp_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & status)332 ulocimp_minimizeSubtags(const char* localeID,
333                         icu::ByteSink& sink,
334                         bool favorScript,
335                         UErrorCode& status) {
336     if (U_FAILURE(status)) { return; }
337     icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
338     _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
339 }
340 
341 // Pairs of (language subtag, + or -) for finding out fast if common languages
342 // are LTR (minus) or RTL (plus).
343 static const char LANG_DIR_STRING[] =
344         "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
345 
346 // Implemented here because this calls ulocimp_addLikelySubtags().
347 U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char * locale)348 uloc_isRightToLeft(const char *locale) {
349     UErrorCode errorCode = U_ZERO_ERROR;
350     icu::CharString lang;
351     icu::CharString script;
352     ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);
353     if (U_FAILURE(errorCode) || script.isEmpty()) {
354         // Fastpath: We know the likely scripts and their writing direction
355         // for some common languages.
356         if (!lang.isEmpty()) {
357             const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
358             if (langPtr != nullptr) {
359                 switch (langPtr[lang.length()]) {
360                 case '-': return false;
361                 case '+': return true;
362                 default: break;  // partial match of a longer code
363                 }
364             }
365         }
366         // Otherwise, find the likely script.
367         errorCode = U_ZERO_ERROR;
368         icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
369         if (U_FAILURE(errorCode)) {
370             return false;
371         }
372         ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
373         if (U_FAILURE(errorCode) || script.isEmpty()) {
374             return false;
375         }
376     }
377     UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
378     return uscript_isRightToLeft(scriptCode);
379 }
380 
381 U_NAMESPACE_BEGIN
382 
383 UBool
isRightToLeft() const384 Locale::isRightToLeft() const {
385     return uloc_isRightToLeft(getBaseName());
386 }
387 
388 U_NAMESPACE_END
389 
390 namespace {
391 icu::CharString
GetRegionFromKey(const char * localeID,std::string_view key,UErrorCode & status)392 GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
393     icu::CharString result;
394     // First check for keyword value
395     icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
396     int32_t len = kw.length();
397     // In UTS35
398     //   type = alphanum{3,8} (sep alphanum{3,8})* ;
399     // so we know the subdivision must fit the type already.
400     //
401     //   unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
402     //   unicode_region_subtag = (alpha{2} | digit{3}) ;
403     //   unicode_subdivision_suffix = alphanum{1,4} ;
404     // But we also know there are no id in start with digit{3} in
405     // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
406     // Therefore we can simplify as
407     // unicode_subdivision_id = alpha{2} alphanum{1,4}
408     //
409     // and only need to accept/reject the code based on the alpha{2} and the length.
410     if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
411         uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
412         // Additional Check
413         static icu::RegionValidateMap valid;
414         const char region[] = {kw[0], kw[1], '\0'};
415         if (valid.isSet(region)) {
416             result.append(uprv_toupper(kw[0]), status);
417             result.append(uprv_toupper(kw[1]), status);
418         }
419     }
420     return result;
421 }
422 }  // namespace
423 
424 U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char * localeID,bool inferRegion,UErrorCode & status)425 ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
426                                      UErrorCode& status) {
427     if (U_FAILURE(status)) {
428         return {};
429     }
430     icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
431     if (U_SUCCESS(status) && rgBuf.isEmpty()) {
432         // No valid rg keyword value, try for unicode_region_subtag
433         rgBuf = ulocimp_getRegion(localeID, status);
434         if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
435             // Second check for sd keyword value
436             rgBuf = GetRegionFromKey(localeID, "sd", status);
437             if (U_SUCCESS(status) && rgBuf.isEmpty()) {
438                 // no unicode_region_subtag but inferRegion true, try likely subtags
439                 UErrorCode rgStatus = U_ZERO_ERROR;
440                 icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
441                 if (U_SUCCESS(rgStatus)) {
442                     rgBuf = ulocimp_getRegion(locBuf.data(), status);
443                 }
444             }
445         }
446     }
447 
448     return rgBuf;
449 }
450 
451 namespace {
452 
// Bitmap of two-letter region codes, copied into RegionValidateMap::map.
// For a code with letters c0 c1 the bit index is
// (c0 - 'A') * 26 + (c1 - 'A'); bit (index % 32) of word (index / 32) is set
// when the code is marked. 22 words hold the 26 * 26 = 676 bits.
//
// The following data is generated by unit test code inside
// test/intltest/regiontst.cpp from the resource data while
// the test failed.
constexpr uint32_t gValidRegionMap[] = {
    0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580,
    0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f,
    0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b,
    0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7,
    0x0410419a, 0x00408557, 0x00004002, 0x00100001,
    0x00400408, 0x00000001,
};
464 
465 }  // namespace
466    //
467 U_NAMESPACE_BEGIN
RegionValidateMap()468 RegionValidateMap::RegionValidateMap() {
469     uprv_memcpy(map, gValidRegionMap, sizeof(map));
470 }
471 
~RegionValidateMap()472 RegionValidateMap::~RegionValidateMap() {
473 }
474 
isSet(const char * region) const475 bool RegionValidateMap::isSet(const char* region) const {
476     int32_t index = value(region);
477     if (index < 0) {
478         return false;
479     }
480     return 0 != (map[index / 32] & (1L << (index % 32)));
481 }
482 
equals(const RegionValidateMap & that) const483 bool RegionValidateMap::equals(const RegionValidateMap& that) const {
484     return uprv_memcmp(map, that.map, sizeof(map)) == 0;
485 }
486 
487 // The code transform two letter a-z to a integer valued between -1, 26x26.
488 // -1 indicate the region is outside the range of two letter a-z
489 // the rest of value is between 0 and 676 (= 26x26) and used as an index
490 // the the bigmap in map. The map is an array of 22 int32_t.
491 // since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t.
value(const char * region) const492 int32_t RegionValidateMap::value(const char* region) const {
493     if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) &&
494         region[2] == '\0') {
495         return (uprv_toupper(region[0])-'A') * 26 +
496                (uprv_toupper(region[1])-'A');
497     }
498     return -1;
499 }
500 
501 U_NAMESPACE_END
502