1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/i18n/rtl.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9
10 #include <algorithm>
11
12 #include "base/command_line.h"
13 #include "base/files/file_path.h"
14 #include "base/i18n/base_i18n_switches.h"
15 #include "base/logging.h"
16 #include "base/macros.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "build/build_config.h"
22 #include "third_party/icu/source/common/unicode/locid.h"
23 #include "third_party/icu/source/common/unicode/uchar.h"
24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/i18n/unicode/coll.h"
26
27 #if defined(OS_IOS)
28 #include "base/debug/crash_logging.h"
29 #include "base/ios/ios_util.h"
30 #endif
31
32 namespace {
33
34 // Extract language, country and variant, but ignore keywords. For example,
35 // en-US, ca@valencia, ca-ES@valencia.
GetLocaleString(const icu::Locale & locale)36 std::string GetLocaleString(const icu::Locale& locale) {
37 const char* language = locale.getLanguage();
38 const char* country = locale.getCountry();
39 const char* variant = locale.getVariant();
40
41 std::string result =
42 (language != nullptr && *language != '\0') ? language : "und";
43
44 if (country != nullptr && *country != '\0') {
45 result += '-';
46 result += country;
47 }
48
49 if (variant != nullptr && *variant != '\0')
50 result += '@' + base::ToLowerASCII(variant);
51
52 return result;
53 }
54
55 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
56 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
57 // http://unicode.org/reports/tr9/ for more information.
GetCharacterDirection(UChar32 character)58 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
59 static bool has_switch = base::CommandLine::ForCurrentProcess()->HasSwitch(
60 switches::kForceTextDirection);
61 if (has_switch) {
62 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
63 std::string force_flag =
64 command_line->GetSwitchValueASCII(switches::kForceTextDirection);
65
66 if (force_flag == switches::kForceDirectionRTL)
67 return base::i18n::RIGHT_TO_LEFT;
68 if (force_flag == switches::kForceDirectionLTR)
69 return base::i18n::LEFT_TO_RIGHT;
70 }
71 // Now that we have the character, we use ICU in order to query for the
72 // appropriate Unicode BiDi character type.
73 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
74 if ((property == U_RIGHT_TO_LEFT) ||
75 (property == U_RIGHT_TO_LEFT_ARABIC) ||
76 (property == U_RIGHT_TO_LEFT_EMBEDDING) ||
77 (property == U_RIGHT_TO_LEFT_OVERRIDE)) {
78 return base::i18n::RIGHT_TO_LEFT;
79 } else if ((property == U_LEFT_TO_RIGHT) ||
80 (property == U_LEFT_TO_RIGHT_EMBEDDING) ||
81 (property == U_LEFT_TO_RIGHT_OVERRIDE)) {
82 return base::i18n::LEFT_TO_RIGHT;
83 }
84 return base::i18n::UNKNOWN_DIRECTION;
85 }
86
87 } // namespace
88
89 namespace base {
90 namespace i18n {
91
// Represents the locale-specific ICU text direction. Acts as a cache for
// ICUIsRTL(): UNKNOWN_DIRECTION means "not computed yet", and
// SetICUDefaultLocale() resets it to UNKNOWN_DIRECTION so that the direction
// is recomputed for the new default locale.
static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
94
95 // Convert the ICU default locale to a string.
GetConfiguredLocale()96 std::string GetConfiguredLocale() {
97 return GetLocaleString(icu::Locale::getDefault());
98 }
99
100 // Convert the ICU canonicalized locale to a string.
GetCanonicalLocale(const std::string & locale)101 std::string GetCanonicalLocale(const std::string& locale) {
102 return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
103 }
104
105 // Convert Chrome locale name to ICU locale name
ICULocaleName(const std::string & locale_string)106 std::string ICULocaleName(const std::string& locale_string) {
107 // If not Spanish, just return it.
108 if (locale_string.substr(0, 2) != "es")
109 return locale_string;
110 // Expand es to es-ES.
111 if (LowerCaseEqualsASCII(locale_string, "es"))
112 return "es-ES";
113 // Map es-419 (Latin American Spanish) to es-FOO depending on the system
114 // locale. If it's es-RR other than es-ES, map to es-RR. Otherwise, map
115 // to es-MX (the most populous in Spanish-speaking Latin America).
116 if (LowerCaseEqualsASCII(locale_string, "es-419")) {
117 const icu::Locale& locale = icu::Locale::getDefault();
118 std::string language = locale.getLanguage();
119 const char* country = locale.getCountry();
120 if (LowerCaseEqualsASCII(language, "es") &&
121 !LowerCaseEqualsASCII(country, "es")) {
122 language += '-';
123 language += country;
124 return language;
125 }
126 return "es-MX";
127 }
128 // Currently, Chrome has only "es" and "es-419", but later we may have
129 // more specific "es-RR".
130 return locale_string;
131 }
132
SetICUDefaultLocale(const std::string & locale_string)133 void SetICUDefaultLocale(const std::string& locale_string) {
134 #if defined(OS_IOS)
135 static base::debug::CrashKeyString* crash_key_locale =
136 base::debug::AllocateCrashKeyString("icu_locale_input",
137 base::debug::CrashKeySize::Size256);
138 base::debug::SetCrashKeyString(crash_key_locale, locale_string);
139 #endif
140 icu::Locale locale(ICULocaleName(locale_string).c_str());
141 UErrorCode error_code = U_ZERO_ERROR;
142 const char* lang = locale.getLanguage();
143 if (lang != nullptr && *lang != '\0') {
144 icu::Locale::setDefault(locale, error_code);
145 } else {
146 LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
147 << ". Falling back to en-US.";
148 icu::Locale::setDefault(icu::Locale::getUS(), error_code);
149 }
150 g_icu_text_direction = UNKNOWN_DIRECTION;
151 }
152
// Returns true when the UI text direction is right-to-left. This simply
// forwards to ICUIsRTL().
bool IsRTL() {
  return ICUIsRTL();
}
156
SetRTLForTesting(bool rtl)157 void SetRTLForTesting(bool rtl) {
158 SetICUDefaultLocale(rtl ? "he" : "en");
159 DCHECK_EQ(rtl, IsRTL());
160 }
161
ICUIsRTL()162 bool ICUIsRTL() {
163 if (g_icu_text_direction == UNKNOWN_DIRECTION) {
164 const icu::Locale& locale = icu::Locale::getDefault();
165 g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
166 }
167 return g_icu_text_direction == RIGHT_TO_LEFT;
168 }
169
GetForcedTextDirection()170 TextDirection GetForcedTextDirection() {
171 // On iOS, check for RTL forcing.
172 #if defined(OS_IOS)
173 if (base::ios::IsInForcedRTL())
174 return base::i18n::RIGHT_TO_LEFT;
175 #endif
176
177 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
178 if (command_line->HasSwitch(switches::kForceUIDirection)) {
179 std::string force_flag =
180 command_line->GetSwitchValueASCII(switches::kForceUIDirection);
181
182 if (force_flag == switches::kForceDirectionLTR)
183 return base::i18n::LEFT_TO_RIGHT;
184
185 if (force_flag == switches::kForceDirectionRTL)
186 return base::i18n::RIGHT_TO_LEFT;
187 }
188
189 return base::i18n::UNKNOWN_DIRECTION;
190 }
191
GetTextDirectionForLocaleInStartUp(const char * locale_name)192 TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
193 // Check for direction forcing.
194 TextDirection forced_direction = GetForcedTextDirection();
195 if (forced_direction != UNKNOWN_DIRECTION)
196 return forced_direction;
197
198 // This list needs to be updated in alphabetical order if we add more RTL
199 // locales.
200 static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
201 std::vector<StringPiece> locale_split =
202 SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);
203 const StringPiece& language_code = locale_split[0];
204 if (std::binary_search(kRTLLanguageCodes,
205 kRTLLanguageCodes + arraysize(kRTLLanguageCodes),
206 language_code))
207 return RIGHT_TO_LEFT;
208 return LEFT_TO_RIGHT;
209 }
210
GetTextDirectionForLocale(const char * locale_name)211 TextDirection GetTextDirectionForLocale(const char* locale_name) {
212 // Check for direction forcing.
213 TextDirection forced_direction = GetForcedTextDirection();
214 if (forced_direction != UNKNOWN_DIRECTION)
215 return forced_direction;
216
217 UErrorCode status = U_ZERO_ERROR;
218 ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
219 DCHECK(U_SUCCESS(status));
220 // Treat anything other than RTL as LTR.
221 return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
222 }
223
GetFirstStrongCharacterDirection(const string16 & text)224 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
225 const UChar* string = text.c_str();
226 size_t length = text.length();
227 size_t position = 0;
228 while (position < length) {
229 UChar32 character;
230 size_t next_position = position;
231 U16_NEXT(string, next_position, length, character);
232 TextDirection direction = GetCharacterDirection(character);
233 if (direction != UNKNOWN_DIRECTION)
234 return direction;
235 position = next_position;
236 }
237 return LEFT_TO_RIGHT;
238 }
239
GetLastStrongCharacterDirection(const string16 & text)240 TextDirection GetLastStrongCharacterDirection(const string16& text) {
241 const UChar* string = text.c_str();
242 size_t position = text.length();
243 while (position > 0) {
244 UChar32 character;
245 size_t prev_position = position;
246 U16_PREV(string, 0, prev_position, character);
247 TextDirection direction = GetCharacterDirection(character);
248 if (direction != UNKNOWN_DIRECTION)
249 return direction;
250 position = prev_position;
251 }
252 return LEFT_TO_RIGHT;
253 }
254
GetStringDirection(const string16 & text)255 TextDirection GetStringDirection(const string16& text) {
256 const UChar* string = text.c_str();
257 size_t length = text.length();
258 size_t position = 0;
259
260 TextDirection result(UNKNOWN_DIRECTION);
261 while (position < length) {
262 UChar32 character;
263 size_t next_position = position;
264 U16_NEXT(string, next_position, length, character);
265 TextDirection direction = GetCharacterDirection(character);
266 if (direction != UNKNOWN_DIRECTION) {
267 if (result != UNKNOWN_DIRECTION && result != direction)
268 return UNKNOWN_DIRECTION;
269 result = direction;
270 }
271 position = next_position;
272 }
273
274 // Handle the case of a string not containing any strong directionality
275 // characters defaulting to LEFT_TO_RIGHT.
276 if (result == UNKNOWN_DIRECTION)
277 return LEFT_TO_RIGHT;
278
279 return result;
280 }
281
282 #if defined(OS_WIN)
AdjustStringForLocaleDirection(string16 * text)283 bool AdjustStringForLocaleDirection(string16* text) {
284 if (!IsRTL() || text->empty())
285 return false;
286
287 // Marking the string as LTR if the locale is RTL and the string does not
288 // contain strong RTL characters. Otherwise, mark the string as RTL.
289 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
290 if (!has_rtl_chars)
291 WrapStringWithLTRFormatting(text);
292 else
293 WrapStringWithRTLFormatting(text);
294
295 return true;
296 }
297
UnadjustStringForLocaleDirection(string16 * text)298 bool UnadjustStringForLocaleDirection(string16* text) {
299 if (!IsRTL() || text->empty())
300 return false;
301
302 *text = StripWrappingBidiControlCharacters(*text);
303 return true;
304 }
305 #else
AdjustStringForLocaleDirection(string16 * text)306 bool AdjustStringForLocaleDirection(string16* text) {
307 // On OS X & GTK the directionality of a label is determined by the first
308 // strongly directional character.
309 // However, we want to make sure that in an LTR-language-UI all strings are
310 // left aligned and vice versa.
311 // A problem can arise if we display a string which starts with user input.
312 // User input may be of the opposite directionality to the UI. So the whole
313 // string will be displayed in the opposite directionality, e.g. if we want to
314 // display in an LTR UI [such as US English]:
315 //
316 // EMAN_NOISNETXE is now installed.
317 //
318 // Since EXTENSION_NAME begins with a strong RTL char, the label's
319 // directionality will be set to RTL and the string will be displayed visually
320 // as:
321 //
322 // .is now installed EMAN_NOISNETXE
323 //
324 // In order to solve this issue, we prepend an LRM to the string. An LRM is a
325 // strongly directional LTR char.
326 // We also append an LRM at the end, which ensures that we're in an LTR
327 // context.
328
329 // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
330 // box so there is no issue with displaying zero-width bidi control characters
331 // on any system. Thus no need for the !IsRTL() check here.
332 if (text->empty())
333 return false;
334
335 bool ui_direction_is_rtl = IsRTL();
336
337 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
338 if (!ui_direction_is_rtl && has_rtl_chars) {
339 WrapStringWithRTLFormatting(text);
340 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
341 kLeftToRightMark);
342 text->push_back(kLeftToRightMark);
343 } else if (ui_direction_is_rtl && has_rtl_chars) {
344 WrapStringWithRTLFormatting(text);
345 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
346 kRightToLeftMark);
347 text->push_back(kRightToLeftMark);
348 } else if (ui_direction_is_rtl) {
349 WrapStringWithLTRFormatting(text);
350 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
351 kRightToLeftMark);
352 text->push_back(kRightToLeftMark);
353 } else {
354 return false;
355 }
356
357 return true;
358 }
359
UnadjustStringForLocaleDirection(string16 * text)360 bool UnadjustStringForLocaleDirection(string16* text) {
361 if (text->empty())
362 return false;
363
364 size_t begin_index = 0;
365 char16 begin = text->at(begin_index);
366 if (begin == kLeftToRightMark ||
367 begin == kRightToLeftMark) {
368 ++begin_index;
369 }
370
371 size_t end_index = text->length() - 1;
372 char16 end = text->at(end_index);
373 if (end == kLeftToRightMark ||
374 end == kRightToLeftMark) {
375 --end_index;
376 }
377
378 string16 unmarked_text =
379 text->substr(begin_index, end_index - begin_index + 1);
380 *text = StripWrappingBidiControlCharacters(unmarked_text);
381 return true;
382 }
383
384 #endif // !OS_WIN
385
EnsureTerminatedDirectionalFormatting(string16 * text)386 void EnsureTerminatedDirectionalFormatting(string16* text) {
387 int count = 0;
388 for (auto c : *text) {
389 if (c == kLeftToRightEmbeddingMark || c == kRightToLeftEmbeddingMark ||
390 c == kLeftToRightOverride || c == kRightToLeftOverride) {
391 ++count;
392 } else if (c == kPopDirectionalFormatting && count > 0) {
393 --count;
394 }
395 }
396 for (int j = 0; j < count; j++)
397 text->push_back(kPopDirectionalFormatting);
398 }
399
// Prepares a user-supplied string for display, modifying |text| in place:
// first terminates any directional formatting left open in it, then adjusts
// it for the locale direction. The result of AdjustStringForLocaleDirection()
// (whether the string was changed) is deliberately not reported.
void SanitizeUserSuppliedString(string16* text) {
  EnsureTerminatedDirectionalFormatting(text);
  AdjustStringForLocaleDirection(text);
}
404
StringContainsStrongRTLChars(const string16 & text)405 bool StringContainsStrongRTLChars(const string16& text) {
406 const UChar* string = text.c_str();
407 size_t length = text.length();
408 size_t position = 0;
409 while (position < length) {
410 UChar32 character;
411 size_t next_position = position;
412 U16_NEXT(string, next_position, length, character);
413
414 // Now that we have the character, we use ICU in order to query for the
415 // appropriate Unicode BiDi character type.
416 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
417 if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
418 return true;
419
420 position = next_position;
421 }
422
423 return false;
424 }
425
WrapStringWithLTRFormatting(string16 * text)426 void WrapStringWithLTRFormatting(string16* text) {
427 if (text->empty())
428 return;
429
430 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
431 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
432 kLeftToRightEmbeddingMark);
433
434 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
435 text->push_back(kPopDirectionalFormatting);
436 }
437
WrapStringWithRTLFormatting(string16 * text)438 void WrapStringWithRTLFormatting(string16* text) {
439 if (text->empty())
440 return;
441
442 // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
443 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
444 kRightToLeftEmbeddingMark);
445
446 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
447 text->push_back(kPopDirectionalFormatting);
448 }
449
WrapPathWithLTRFormatting(const FilePath & path,string16 * rtl_safe_path)450 void WrapPathWithLTRFormatting(const FilePath& path,
451 string16* rtl_safe_path) {
452 // Wrap the overall path with LRE-PDF pair which essentialy marks the
453 // string as a Left-To-Right string.
454 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
455 rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
456 #if defined(OS_MACOSX)
457 rtl_safe_path->append(UTF8ToUTF16(path.value()));
458 #elif defined(OS_WIN)
459 rtl_safe_path->append(path.value());
460 #else // defined(OS_POSIX) && !defined(OS_MACOSX)
461 std::wstring wide_path = base::SysNativeMBToWide(path.value());
462 rtl_safe_path->append(WideToUTF16(wide_path));
463 #endif
464 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
465 rtl_safe_path->push_back(kPopDirectionalFormatting);
466 }
467
// Returns |text| prepared for display as an LTR string. The text is wrapped
// with LTR formatting when the UI is RTL (it may be appended to an RTL
// string) or when its first strong character is RTL; otherwise an unmodified
// copy is returned.
string16 GetDisplayStringInLTRDirectionality(const string16& text) {
  const bool needs_wrapping =
      IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT;
  if (!needs_wrapping)
    return text;

  string16 wrapped_text(text);
  WrapStringWithLTRFormatting(&wrapped_text);
  return wrapped_text;
}
478
// Returns a copy of |text| without its wrapping bidi control characters: at
// most one leading LRE/RLE/LRO/RLO and at most one trailing PDF. The two ends
// are examined independently, so an unpaired control character is removed as
// well. Empty text is returned unchanged.
string16 StripWrappingBidiControlCharacters(const string16& text) {
  if (text.empty())
    return text;

  const char16 leading = text.front();
  const bool has_leading_control = leading == kLeftToRightEmbeddingMark ||
                                   leading == kRightToLeftEmbeddingMark ||
                                   leading == kLeftToRightOverride ||
                                   leading == kRightToLeftOverride;
  const bool has_trailing_pop = text.back() == kPopDirectionalFormatting;

  // [start, stop) is the range that is kept. A single code unit cannot be
  // both an opening control and a PDF, so |stop| never drops below |start|.
  const size_t start = has_leading_control ? 1 : 0;
  const size_t stop = text.length() - (has_trailing_pop ? 1 : 0);
  return text.substr(start, stop - start);
}
494
495 } // namespace i18n
496 } // namespace base
497