1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1997-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: loclikely.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2010feb25
16 * created by: Markus W. Scherer
17 *
18 * Code for likely and minimized locale subtags, separated out from other .cpp files
19 * that then do not depend on resource bundle code and likely-subtags data.
20 */
21
22 #include <utility>
23
24 #include "unicode/bytestream.h"
25 #include "unicode/utypes.h"
26 #include "unicode/locid.h"
27 #include "unicode/putil.h"
28 #include "unicode/uchar.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ures.h"
31 #include "unicode/uscript.h"
32 #include "bytesinkutil.h"
33 #include "charstr.h"
34 #include "cmemory.h"
35 #include "cstring.h"
36 #include "loclikelysubtags.h"
37 #include "ulocimp.h"
38
39 namespace {
40
41 /**
42 * Create a tag string from the supplied parameters. The lang, script and region
43 * parameters may be nullptr pointers. If they are, their corresponding length parameters
44 * must be less than or equal to 0.
45 *
46 * If an illegal argument is provided, the function returns the error
47 * U_ILLEGAL_ARGUMENT_ERROR.
48 *
49 * @param lang The language tag to use.
50 * @param langLength The length of the language tag.
51 * @param script The script tag to use.
52 * @param scriptLength The length of the script tag.
53 * @param region The region tag to use.
54 * @param regionLength The length of the region tag.
55 * @param variant The region tag to use.
56 * @param variantLength The length of the region tag.
57 * @param trailing Any trailing data to append to the new tag.
58 * @param trailingLength The length of the trailing data.
59 * @param sink The output sink receiving the tag string.
60 * @param err A pointer to a UErrorCode for error reporting.
61 **/
62 void U_CALLCONV
createTagStringWithAlternates(const char * lang,int32_t langLength,const char * script,int32_t scriptLength,const char * region,int32_t regionLength,const char * variant,int32_t variantLength,const char * trailing,int32_t trailingLength,icu::ByteSink & sink,UErrorCode & err)63 createTagStringWithAlternates(
64 const char* lang,
65 int32_t langLength,
66 const char* script,
67 int32_t scriptLength,
68 const char* region,
69 int32_t regionLength,
70 const char* variant,
71 int32_t variantLength,
72 const char* trailing,
73 int32_t trailingLength,
74 icu::ByteSink& sink,
75 UErrorCode& err) {
76 if (U_FAILURE(err)) {
77 return;
78 }
79
80 if (langLength >= ULOC_LANG_CAPACITY ||
81 scriptLength >= ULOC_SCRIPT_CAPACITY ||
82 regionLength >= ULOC_COUNTRY_CAPACITY) {
83 err = U_ILLEGAL_ARGUMENT_ERROR;
84 return;
85 }
86
87 if (langLength > 0) {
88 sink.Append(lang, langLength);
89 }
90
91 if (scriptLength > 0) {
92 sink.Append("_", 1);
93 sink.Append(script, scriptLength);
94 }
95
96 if (regionLength > 0) {
97 sink.Append("_", 1);
98 sink.Append(region, regionLength);
99 }
100
101 if (variantLength > 0) {
102 if (regionLength == 0) {
103 /* extra separator is required */
104 sink.Append("_", 1);
105 }
106 sink.Append("_", 1);
107 sink.Append(variant, variantLength);
108 }
109
110 if (trailingLength > 0) {
111 /*
112 * Copy the trailing data into the supplied buffer.
113 */
114 sink.Append(trailing, trailingLength);
115 }
116 }
117
CHECK_TRAILING_VARIANT_SIZE(const char * variant,int32_t variantLength)118 bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
119 int32_t count = 0;
120 for (int32_t i = 0; i < variantLength; i++) {
121 if (_isIDSeparator(variant[i])) {
122 count = 0;
123 } else if (count == 8) {
124 return false;
125 } else {
126 count++;
127 }
128 }
129 return true;
130 }
131
132 void
_uloc_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & err)133 _uloc_addLikelySubtags(const char* localeID,
134 icu::ByteSink& sink,
135 UErrorCode& err) {
136 if (U_FAILURE(err)) {
137 return;
138 }
139
140 if (localeID == nullptr) {
141 err = U_ILLEGAL_ARGUMENT_ERROR;
142 return;
143 }
144
145 icu::CharString lang;
146 icu::CharString script;
147 icu::CharString region;
148 icu::CharString variant;
149 const char* trailing = nullptr;
150 ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err);
151 if (U_FAILURE(err)) {
152 return;
153 }
154
155 if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
156 err = U_ILLEGAL_ARGUMENT_ERROR;
157 return;
158 }
159
160 if (lang.length() == 4) {
161 if (script.isEmpty()) {
162 script = std::move(lang);
163 lang.clear();
164 } else {
165 err = U_ILLEGAL_ARGUMENT_ERROR;
166 return;
167 }
168 } else if (lang.length() > 8) {
169 err = U_ILLEGAL_ARGUMENT_ERROR;
170 return;
171 }
172
173 int32_t trailingLength = (int32_t)uprv_strlen(trailing);
174
175 const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
176 if (U_FAILURE(err)) {
177 return;
178 }
179 // We need to keep l on the stack because lsr may point into internal
180 // memory of l.
181 icu::Locale l = icu::Locale::createFromName(localeID);
182 if (l.isBogus()) {
183 err = U_ILLEGAL_ARGUMENT_ERROR;
184 return;
185 }
186 icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
187 if (U_FAILURE(err)) {
188 return;
189 }
190 const char* language = lsr.language;
191 if (uprv_strcmp(language, "und") == 0) {
192 language = "";
193 }
194 createTagStringWithAlternates(
195 language,
196 (int32_t)uprv_strlen(language),
197 lsr.script,
198 (int32_t)uprv_strlen(lsr.script),
199 lsr.region,
200 (int32_t)uprv_strlen(lsr.region),
201 variant.data(),
202 variant.length(),
203 trailing,
204 trailingLength,
205 sink,
206 err);
207 }
208
209 void
_uloc_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & err)210 _uloc_minimizeSubtags(const char* localeID,
211 icu::ByteSink& sink,
212 bool favorScript,
213 UErrorCode& err) {
214 if (U_FAILURE(err)) {
215 return;
216 }
217
218 if (localeID == nullptr) {
219 err = U_ILLEGAL_ARGUMENT_ERROR;
220 return;
221 }
222
223 icu::CharString lang;
224 icu::CharString script;
225 icu::CharString region;
226 icu::CharString variant;
227 const char* trailing = nullptr;
228 ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err);
229 if (U_FAILURE(err)) {
230 return;
231 }
232
233 if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
234 err = U_ILLEGAL_ARGUMENT_ERROR;
235 return;
236 }
237
238 int32_t trailingLength = (int32_t)uprv_strlen(trailing);
239
240 const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
241 if (U_FAILURE(err)) {
242 return;
243 }
244 icu::LSR lsr = likelySubtags->minimizeSubtags(
245 lang.toStringPiece(),
246 script.toStringPiece(),
247 region.toStringPiece(),
248 favorScript,
249 err);
250 if (U_FAILURE(err)) {
251 return;
252 }
253 const char* language = lsr.language;
254 if (uprv_strcmp(language, "und") == 0) {
255 language = "";
256 }
257 createTagStringWithAlternates(
258 language,
259 (int32_t)uprv_strlen(language),
260 lsr.script,
261 (int32_t)uprv_strlen(lsr.script),
262 lsr.region,
263 (int32_t)uprv_strlen(lsr.region),
264 variant.data(),
265 variant.length(),
266 trailing,
267 trailingLength,
268 sink,
269 err);
270 }
271
272 } // namespace
273
274 U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char * localeID,char * maximizedLocaleID,int32_t maximizedLocaleIDCapacity,UErrorCode * status)275 uloc_addLikelySubtags(const char* localeID,
276 char* maximizedLocaleID,
277 int32_t maximizedLocaleIDCapacity,
278 UErrorCode* status) {
279 return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
280 maximizedLocaleID, maximizedLocaleIDCapacity,
281 [&](icu::ByteSink& sink, UErrorCode& status) {
282 ulocimp_addLikelySubtags(localeID, sink, status);
283 },
284 *status);
285 }
286
287 U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char * localeID,UErrorCode & status)288 ulocimp_addLikelySubtags(const char* localeID,
289 UErrorCode& status) {
290 return icu::ByteSinkUtil::viaByteSinkToCharString(
291 [&](icu::ByteSink& sink, UErrorCode& status) {
292 ulocimp_addLikelySubtags(localeID, sink, status);
293 },
294 status);
295 }
296
297 U_EXPORT void
ulocimp_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode & status)298 ulocimp_addLikelySubtags(const char* localeID,
299 icu::ByteSink& sink,
300 UErrorCode& status) {
301 if (U_FAILURE(status)) { return; }
302 icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
303 _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
304 }
305
306 U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char * localeID,char * minimizedLocaleID,int32_t minimizedLocaleIDCapacity,UErrorCode * status)307 uloc_minimizeSubtags(const char* localeID,
308 char* minimizedLocaleID,
309 int32_t minimizedLocaleIDCapacity,
310 UErrorCode* status) {
311 return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
312 minimizedLocaleID, minimizedLocaleIDCapacity,
313 [&](icu::ByteSink& sink, UErrorCode& status) {
314 ulocimp_minimizeSubtags(localeID, sink, false, status);
315 },
316 *status);
317 }
318
319 U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char * localeID,bool favorScript,UErrorCode & status)320 ulocimp_minimizeSubtags(const char* localeID,
321 bool favorScript,
322 UErrorCode& status) {
323 return icu::ByteSinkUtil::viaByteSinkToCharString(
324 [&](icu::ByteSink& sink, UErrorCode& status) {
325 ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
326 },
327 status);
328 }
329
330 U_EXPORT void
ulocimp_minimizeSubtags(const char * localeID,icu::ByteSink & sink,bool favorScript,UErrorCode & status)331 ulocimp_minimizeSubtags(const char* localeID,
332 icu::ByteSink& sink,
333 bool favorScript,
334 UErrorCode& status) {
335 if (U_FAILURE(status)) { return; }
336 icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
337 _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
338 }
339
340 // Pairs of (language subtag, + or -) for finding out fast if common languages
341 // are LTR (minus) or RTL (plus).
342 static const char LANG_DIR_STRING[] =
343 "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
344
345 // Implemented here because this calls ulocimp_addLikelySubtags().
346 U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char * locale)347 uloc_isRightToLeft(const char *locale) {
348 UErrorCode errorCode = U_ZERO_ERROR;
349 icu::CharString lang;
350 icu::CharString script;
351 ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);
352 if (U_FAILURE(errorCode) || script.isEmpty()) {
353 // Fastpath: We know the likely scripts and their writing direction
354 // for some common languages.
355 if (!lang.isEmpty()) {
356 const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
357 if (langPtr != nullptr) {
358 switch (langPtr[lang.length()]) {
359 case '-': return false;
360 case '+': return true;
361 default: break; // partial match of a longer code
362 }
363 }
364 }
365 // Otherwise, find the likely script.
366 errorCode = U_ZERO_ERROR;
367 icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
368 if (U_FAILURE(errorCode)) {
369 return false;
370 }
371 ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
372 if (U_FAILURE(errorCode) || script.isEmpty()) {
373 return false;
374 }
375 }
376 UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
377 return uscript_isRightToLeft(scriptCode);
378 }
379
380 U_NAMESPACE_BEGIN
381
382 UBool
isRightToLeft() const383 Locale::isRightToLeft() const {
384 return uloc_isRightToLeft(getBaseName());
385 }
386
387 U_NAMESPACE_END
388
389 namespace {
390 icu::CharString
GetRegionFromKey(const char * localeID,const char * key,UErrorCode & status)391 GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
392 icu::CharString result;
393
394 // First check for keyword value
395 icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
396 int32_t len = kw.length();
397 if (U_SUCCESS(status) && len >= 3 && len <= 7) {
398 // chop off the subdivision code (which will generally be "zzzz" anyway)
399 const char* const data = kw.data();
400 if (uprv_isASCIILetter(data[0])) {
401 result.append(uprv_toupper(data[0]), status);
402 result.append(uprv_toupper(data[1]), status);
403 } else {
404 // assume three-digit region code
405 result.append(data, 3, status);
406 }
407 }
408 return result;
409 }
410 } // namespace
411
412 U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char * localeID,bool inferRegion,UErrorCode & status)413 ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
414 UErrorCode& status) {
415 if (U_FAILURE(status)) {
416 return {};
417 }
418 icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
419 if (U_SUCCESS(status) && rgBuf.isEmpty()) {
420 // No valid rg keyword value, try for unicode_region_subtag
421 rgBuf = ulocimp_getRegion(localeID, status);
422 if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
423 // Second check for sd keyword value
424 rgBuf = GetRegionFromKey(localeID, "sd", status);
425 if (U_SUCCESS(status) && rgBuf.isEmpty()) {
426 // no unicode_region_subtag but inferRegion true, try likely subtags
427 UErrorCode rgStatus = U_ZERO_ERROR;
428 icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
429 if (U_SUCCESS(rgStatus)) {
430 rgBuf = ulocimp_getRegion(locBuf.data(), status);
431 }
432 }
433 }
434 }
435
436 return rgBuf;
437 }
438