• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1997-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  loclikely.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2010feb25
16 *   created by: Markus W. Scherer
17 *
18 *   Code for likely and minimized locale subtags, separated out from other .cpp files
19 *   that then do not depend on resource bundle code and likely-subtags data.
20 */
21 
22 #include <utility>
23 
24 #include "unicode/bytestream.h"
25 #include "unicode/utypes.h"
26 #include "unicode/locid.h"
27 #include "unicode/putil.h"
28 #include "unicode/uchar.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ures.h"
31 #include "unicode/uscript.h"
32 #include "bytesinkutil.h"
33 #include "charstr.h"
34 #include "cmemory.h"
35 #include "cstring.h"
36 #include "loclikelysubtags.h"
37 #include "ulocimp.h"
38 
39 namespace {
40 
41 /**
42  * Create a tag string from the supplied parameters.  The lang, script and region
43  * parameters may be nullptr pointers. If they are, their corresponding length parameters
44  * must be less than or equal to 0.
45  *
46  * If an illegal argument is provided, the function returns the error
47  * U_ILLEGAL_ARGUMENT_ERROR.
48  *
49  * @param lang The language tag to use.
50  * @param langLength The length of the language tag.
51  * @param script The script tag to use.
52  * @param scriptLength The length of the script tag.
53  * @param region The region tag to use.
54  * @param regionLength The length of the region tag.
55  * @param variant The region tag to use.
56  * @param variantLength The length of the region tag.
57  * @param trailing Any trailing data to append to the new tag.
58  * @param trailingLength The length of the trailing data.
59  * @param sink The output sink receiving the tag string.
60  * @param err A pointer to a UErrorCode for error reporting.
61  **/
62 void U_CALLCONV
createTagStringWithAlternates(const char * lang,int32_t langLength,const char * script,int32_t scriptLength,const char * region,int32_t regionLength,const char * variant,int32_t variantLength,const char * trailing,int32_t trailingLength,icu::ByteSink & sink,UErrorCode & err)63 createTagStringWithAlternates(
64     const char* lang,
65     int32_t langLength,
66     const char* script,
67     int32_t scriptLength,
68     const char* region,
69     int32_t regionLength,
70     const char* variant,
71     int32_t variantLength,
72     const char* trailing,
73     int32_t trailingLength,
74     icu::ByteSink& sink,
75     UErrorCode& err) {
76     if (U_FAILURE(err)) {
77         return;
78     }
79 
80     if (langLength >= ULOC_LANG_CAPACITY ||
81             scriptLength >= ULOC_SCRIPT_CAPACITY ||
82             regionLength >= ULOC_COUNTRY_CAPACITY) {
83         err = U_ILLEGAL_ARGUMENT_ERROR;
84         return;
85     }
86 
87     if (langLength > 0) {
88         sink.Append(lang, langLength);
89     }
90 
91     if (scriptLength > 0) {
92         sink.Append("_", 1);
93         sink.Append(script, scriptLength);
94     }
95 
96     if (regionLength > 0) {
97         sink.Append("_", 1);
98         sink.Append(region, regionLength);
99     }
100 
101     if (variantLength > 0) {
102         if (regionLength == 0) {
103             /* extra separator is required */
104             sink.Append("_", 1);
105         }
106         sink.Append("_", 1);
107         sink.Append(variant, variantLength);
108     }
109 
110     if (trailingLength > 0) {
111         /*
112          * Copy the trailing data into the supplied buffer.
113          */
114         sink.Append(trailing, trailingLength);
115     }
116 }
117 
CHECK_TRAILING_VARIANT_SIZE(const char * variant,int32_t variantLength)118 bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
119     int32_t count = 0;
120     for (int32_t i = 0; i < variantLength; i++) {
121         if (_isIDSeparator(variant[i])) {
122             count = 0;
123         } else if (count == 8) {
124             return false;
125         } else {
126             count++;
127         }
128     }
129     return true;
130 }
131 
132 void
_uloc_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & err)133 _uloc_addLikelySubtags(const char* localeID,
134                        icu::ByteSink& sink,
135                        UErrorCode& err) {
136     if (U_FAILURE(err)) {
137         return;
138     }
139 
140     if (localeID == nullptr) {
141         err = U_ILLEGAL_ARGUMENT_ERROR;
142         return;
143     }
144 
145     icu::CharString lang;
146     icu::CharString script;
147     icu::CharString region;
148     icu::CharString variant;
149     const char* trailing = nullptr;
150     ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
151     if (U_FAILURE(err)) {
152         return;
153     }
154 
155     if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
156         err = U_ILLEGAL_ARGUMENT_ERROR;
157         return;
158     }
159 
160     if (lang.length() == 4) {
161         if (script.isEmpty()) {
162             script = std::move(lang);
163             lang.clear();
164         } else {
165             err = U_ILLEGAL_ARGUMENT_ERROR;
166             return;
167         }
168     } else if (lang.length() > 8) {
169         err = U_ILLEGAL_ARGUMENT_ERROR;
170         return;
171     }
172 
173     int32_t trailingLength = (int32_t)uprv_strlen(trailing);
174 
175     const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
176     if (U_FAILURE(err)) {
177         return;
178     }
179     // We need to keep l on the stack because lsr may point into internal
180     // memory of l.
181     icu::Locale l = icu::Locale::createFromName(localeID);
182     if (l.isBogus()) {
183         err = U_ILLEGAL_ARGUMENT_ERROR;
184         return;
185     }
186     icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
187     if (U_FAILURE(err)) {
188         return;
189     }
190     const char* language = lsr.language;
191     if (uprv_strcmp(language, "und") == 0) {
192         language = "";
193     }
194     createTagStringWithAlternates(
195         language,
196         (int32_t)uprv_strlen(language),
197         lsr.script,
198         (int32_t)uprv_strlen(lsr.script),
199         lsr.region,
200         (int32_t)uprv_strlen(lsr.region),
201         variant.data(),
202         variant.length(),
203         trailing,
204         trailingLength,
205         sink,
206         err);
207 }
208 
209 void
_uloc_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & err)210 _uloc_minimizeSubtags(const char* localeID,
211                       icu::ByteSink& sink,
212                       bool favorScript,
213                       UErrorCode& err) {
214     if (U_FAILURE(err)) {
215         return;
216     }
217 
218     if (localeID == nullptr) {
219         err = U_ILLEGAL_ARGUMENT_ERROR;
220         return;
221     }
222 
223     icu::CharString lang;
224     icu::CharString script;
225     icu::CharString region;
226     icu::CharString variant;
227     const char* trailing = nullptr;
228     ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
229     if (U_FAILURE(err)) {
230         return;
231     }
232 
233     if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
234         err = U_ILLEGAL_ARGUMENT_ERROR;
235         return;
236     }
237 
238     int32_t trailingLength = (int32_t)uprv_strlen(trailing);
239 
240     const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
241     if (U_FAILURE(err)) {
242         return;
243     }
244     icu::LSR lsr = likelySubtags->minimizeSubtags(
245         lang.toStringPiece(),
246         script.toStringPiece(),
247         region.toStringPiece(),
248         favorScript,
249         err);
250     if (U_FAILURE(err)) {
251         return;
252     }
253     const char* language = lsr.language;
254     if (uprv_strcmp(language, "und") == 0) {
255         language = "";
256     }
257     createTagStringWithAlternates(
258         language,
259         (int32_t)uprv_strlen(language),
260         lsr.script,
261         (int32_t)uprv_strlen(lsr.script),
262         lsr.region,
263         (int32_t)uprv_strlen(lsr.region),
264         variant.data(),
265         variant.length(),
266         trailing,
267         trailingLength,
268         sink,
269         err);
270 }
271 
272 }  // namespace
273 
274 U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char * localeID,char * maximizedLocaleID,int32_t maximizedLocaleIDCapacity,UErrorCode * status)275 uloc_addLikelySubtags(const char* localeID,
276                       char* maximizedLocaleID,
277                       int32_t maximizedLocaleIDCapacity,
278                       UErrorCode* status) {
279     return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
280         maximizedLocaleID, maximizedLocaleIDCapacity,
281         [&](icu::ByteSink& sink, UErrorCode& status) {
282             ulocimp_addLikelySubtags(localeID, sink, status);
283         },
284         *status);
285 }
286 
287 U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char * localeID,UErrorCode & status)288 ulocimp_addLikelySubtags(const char* localeID,
289                          UErrorCode& status) {
290     return icu::ByteSinkUtil::viaByteSinkToCharString(
291         [&](icu::ByteSink& sink, UErrorCode& status) {
292             ulocimp_addLikelySubtags(localeID, sink, status);
293         },
294         status);
295 }
296 
297 U_EXPORT void
ulocimp_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & status)298 ulocimp_addLikelySubtags(const char* localeID,
299                          icu::ByteSink& sink,
300                          UErrorCode& status) {
301     if (U_FAILURE(status)) { return; }
302     icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
303     _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
304 }
305 
306 U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char * localeID,char * minimizedLocaleID,int32_t minimizedLocaleIDCapacity,UErrorCode * status)307 uloc_minimizeSubtags(const char* localeID,
308                      char* minimizedLocaleID,
309                      int32_t minimizedLocaleIDCapacity,
310                      UErrorCode* status) {
311     return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
312         minimizedLocaleID, minimizedLocaleIDCapacity,
313         [&](icu::ByteSink& sink, UErrorCode& status) {
314             ulocimp_minimizeSubtags(localeID, sink, false, status);
315         },
316         *status);
317 }
318 
319 U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char * localeID,bool favorScript,UErrorCode & status)320 ulocimp_minimizeSubtags(const char* localeID,
321                         bool favorScript,
322                         UErrorCode& status) {
323     return icu::ByteSinkUtil::viaByteSinkToCharString(
324         [&](icu::ByteSink& sink, UErrorCode& status) {
325             ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
326         },
327         status);
328 }
329 
330 U_EXPORT void
ulocimp_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & status)331 ulocimp_minimizeSubtags(const char* localeID,
332                         icu::ByteSink& sink,
333                         bool favorScript,
334                         UErrorCode& status) {
335     if (U_FAILURE(status)) { return; }
336     icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
337     _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
338 }
339 
340 // Pairs of (language subtag, + or -) for finding out fast if common languages
341 // are LTR (minus) or RTL (plus).
342 static const char LANG_DIR_STRING[] =
343         "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
344 
345 // Implemented here because this calls ulocimp_addLikelySubtags().
346 U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char * locale)347 uloc_isRightToLeft(const char *locale) {
348     UErrorCode errorCode = U_ZERO_ERROR;
349     icu::CharString lang;
350     icu::CharString script;
351     ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);
352     if (U_FAILURE(errorCode) || script.isEmpty()) {
353         // Fastpath: We know the likely scripts and their writing direction
354         // for some common languages.
355         if (!lang.isEmpty()) {
356             const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
357             if (langPtr != nullptr) {
358                 switch (langPtr[lang.length()]) {
359                 case '-': return false;
360                 case '+': return true;
361                 default: break;  // partial match of a longer code
362                 }
363             }
364         }
365         // Otherwise, find the likely script.
366         errorCode = U_ZERO_ERROR;
367         icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
368         if (U_FAILURE(errorCode)) {
369             return false;
370         }
371         ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
372         if (U_FAILURE(errorCode) || script.isEmpty()) {
373             return false;
374         }
375     }
376     UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
377     return uscript_isRightToLeft(scriptCode);
378 }
379 
380 U_NAMESPACE_BEGIN
381 
382 UBool
isRightToLeft() const383 Locale::isRightToLeft() const {
384     return uloc_isRightToLeft(getBaseName());
385 }
386 
387 U_NAMESPACE_END
388 
389 namespace {
390 icu::CharString
GetRegionFromKey(const char * localeID,const char * key,UErrorCode & status)391 GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
392     icu::CharString result;
393 
394     // First check for keyword value
395     icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
396     int32_t len = kw.length();
397     if (U_SUCCESS(status) && len >= 3 && len <= 7) {
398         // chop off the subdivision code (which will generally be "zzzz" anyway)
399         const char* const data = kw.data();
400         if (uprv_isASCIILetter(data[0])) {
401             result.append(uprv_toupper(data[0]), status);
402             result.append(uprv_toupper(data[1]), status);
403         } else {
404             // assume three-digit region code
405             result.append(data, 3, status);
406         }
407     }
408     return result;
409 }
410 }  // namespace
411 
412 U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char * localeID,bool inferRegion,UErrorCode & status)413 ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
414                                      UErrorCode& status) {
415     if (U_FAILURE(status)) {
416         return {};
417     }
418     icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
419     if (U_SUCCESS(status) && rgBuf.isEmpty()) {
420         // No valid rg keyword value, try for unicode_region_subtag
421         rgBuf = ulocimp_getRegion(localeID, status);
422         if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
423             // Second check for sd keyword value
424             rgBuf = GetRegionFromKey(localeID, "sd", status);
425             if (U_SUCCESS(status) && rgBuf.isEmpty()) {
426                 // no unicode_region_subtag but inferRegion true, try likely subtags
427                 UErrorCode rgStatus = U_ZERO_ERROR;
428                 icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
429                 if (U_SUCCESS(rgStatus)) {
430                     rgBuf = ulocimp_getRegion(locBuf.data(), status);
431                 }
432             }
433         }
434     }
435 
436     return rgBuf;
437 }
438