1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_INTL_SUPPORT
6 #error Internationalization is expected to be enabled.
7 #endif // V8_INTL_SUPPORT
8
9 #include "src/objects/js-collator.h"
10
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/objects/js-collator-inl.h"
14 #include "unicode/coll.h"
15 #include "unicode/locid.h"
16 #include "unicode/strenum.h"
17 #include "unicode/ucol.h"
18 #include "unicode/uloc.h"
19
20 namespace v8 {
21 namespace internal {
22
23 namespace {
24
25 // TODO(gsathya): Consider internalizing the value strings.
CreateDataPropertyForOptions(Isolate * isolate,Handle<JSObject> options,Handle<String> key,const char * value)26 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
27 Handle<String> key, const char* value) {
28 CHECK_NOT_NULL(value);
29 Handle<String> value_str =
30 isolate->factory()->NewStringFromAsciiChecked(value);
31
32 // This is a brand new JSObject that shouldn't already have the same
33 // key so this shouldn't fail.
34 CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_str,
35 kDontThrow)
36 .FromJust());
37 }
38
CreateDataPropertyForOptions(Isolate * isolate,Handle<JSObject> options,Handle<String> key,bool value)39 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
40 Handle<String> key, bool value) {
41 Handle<Object> value_obj = isolate->factory()->ToBoolean(value);
42
43 // This is a brand new JSObject that shouldn't already have the same
44 // key so this shouldn't fail.
45 CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_obj,
46 kDontThrow)
47 .FromJust());
48 }
49
50 } // anonymous namespace
51
52 // static
ResolvedOptions(Isolate * isolate,Handle<JSCollator> collator)53 Handle<JSObject> JSCollator::ResolvedOptions(Isolate* isolate,
54 Handle<JSCollator> collator) {
55 Handle<JSObject> options =
56 isolate->factory()->NewJSObject(isolate->object_function());
57
58 JSCollator::Usage usage = collator->usage();
59 CreateDataPropertyForOptions(isolate, options,
60 isolate->factory()->usage_string(),
61 JSCollator::UsageToString(usage));
62
63 icu::Collator* icu_collator = collator->icu_collator()->raw();
64 CHECK_NOT_NULL(icu_collator);
65
66 UErrorCode status = U_ZERO_ERROR;
67 bool numeric =
68 icu_collator->getAttribute(UCOL_NUMERIC_COLLATION, status) == UCOL_ON;
69 CHECK(U_SUCCESS(status));
70 CreateDataPropertyForOptions(isolate, options,
71 isolate->factory()->numeric_string(), numeric);
72
73 const char* case_first = nullptr;
74 status = U_ZERO_ERROR;
75 switch (icu_collator->getAttribute(UCOL_CASE_FIRST, status)) {
76 case UCOL_LOWER_FIRST:
77 case_first = "lower";
78 break;
79 case UCOL_UPPER_FIRST:
80 case_first = "upper";
81 break;
82 default:
83 case_first = "false";
84 }
85 CHECK(U_SUCCESS(status));
86 CreateDataPropertyForOptions(
87 isolate, options, isolate->factory()->caseFirst_string(), case_first);
88
89 const char* sensitivity = nullptr;
90 status = U_ZERO_ERROR;
91 switch (icu_collator->getAttribute(UCOL_STRENGTH, status)) {
92 case UCOL_PRIMARY: {
93 CHECK(U_SUCCESS(status));
94 status = U_ZERO_ERROR;
95 // case level: true + s1 -> case, s1 -> base.
96 if (UCOL_ON == icu_collator->getAttribute(UCOL_CASE_LEVEL, status)) {
97 sensitivity = "case";
98 } else {
99 sensitivity = "base";
100 }
101 CHECK(U_SUCCESS(status));
102 break;
103 }
104 case UCOL_SECONDARY:
105 sensitivity = "accent";
106 break;
107 case UCOL_TERTIARY:
108 sensitivity = "variant";
109 break;
110 case UCOL_QUATERNARY:
111 // We shouldn't get quaternary and identical from ICU, but if we do
112 // put them into variant.
113 sensitivity = "variant";
114 break;
115 default:
116 sensitivity = "variant";
117 }
118 CHECK(U_SUCCESS(status));
119 CreateDataPropertyForOptions(
120 isolate, options, isolate->factory()->sensitivity_string(), sensitivity);
121
122 status = U_ZERO_ERROR;
123 bool ignore_punctuation = icu_collator->getAttribute(UCOL_ALTERNATE_HANDLING,
124 status) == UCOL_SHIFTED;
125 CHECK(U_SUCCESS(status));
126 CreateDataPropertyForOptions(isolate, options,
127 isolate->factory()->ignorePunctuation_string(),
128 ignore_punctuation);
129
130 status = U_ZERO_ERROR;
131 const char* collation;
132 std::unique_ptr<icu::StringEnumeration> collation_values(
133 icu_collator->getKeywordValues("co", status));
134 // Collation wasn't provided as a keyword to icu, use default.
135 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
136 CreateDataPropertyForOptions(
137 isolate, options, isolate->factory()->collation_string(), "default");
138 } else {
139 CHECK(U_SUCCESS(status));
140 CHECK_NOT_NULL(collation_values.get());
141
142 int32_t length;
143 status = U_ZERO_ERROR;
144 collation = collation_values->next(&length, status);
145 CHECK(U_SUCCESS(status));
146
147 // There has to be at least one value.
148 CHECK_NOT_NULL(collation);
149 CreateDataPropertyForOptions(
150 isolate, options, isolate->factory()->collation_string(), collation);
151
152 status = U_ZERO_ERROR;
153 collation_values->reset(status);
154 CHECK(U_SUCCESS(status));
155 }
156
157 status = U_ZERO_ERROR;
158 icu::Locale icu_locale = icu_collator->getLocale(ULOC_VALID_LOCALE, status);
159 CHECK(U_SUCCESS(status));
160
161 char result[ULOC_FULLNAME_CAPACITY];
162 status = U_ZERO_ERROR;
163 uloc_toLanguageTag(icu_locale.getName(), result, ULOC_FULLNAME_CAPACITY,
164 FALSE, &status);
165 CHECK(U_SUCCESS(status));
166
167 CreateDataPropertyForOptions(isolate, options,
168 isolate->factory()->locale_string(), result);
169
170 return options;
171 }
172
173 namespace {
174
LookupUnicodeExtensions(const icu::Locale & icu_locale,const std::set<std::string> & relevant_keys)175 std::map<std::string, std::string> LookupUnicodeExtensions(
176 const icu::Locale& icu_locale, const std::set<std::string>& relevant_keys) {
177 std::map<std::string, std::string> extensions;
178
179 UErrorCode status = U_ZERO_ERROR;
180 std::unique_ptr<icu::StringEnumeration> keywords(
181 icu_locale.createKeywords(status));
182 if (U_FAILURE(status)) return extensions;
183
184 if (!keywords) return extensions;
185 char value[ULOC_FULLNAME_CAPACITY];
186
187 int32_t length;
188 status = U_ZERO_ERROR;
189 for (const char* keyword = keywords->next(&length, status);
190 keyword != nullptr; keyword = keywords->next(&length, status)) {
191 // Ignore failures in ICU and skip to the next keyword.
192 //
193 // This is fine.™
194 if (U_FAILURE(status)) {
195 status = U_ZERO_ERROR;
196 continue;
197 }
198
199 icu_locale.getKeywordValue(keyword, value, ULOC_FULLNAME_CAPACITY, status);
200
201 // Ignore failures in ICU and skip to the next keyword.
202 //
203 // This is fine.™
204 if (U_FAILURE(status)) {
205 status = U_ZERO_ERROR;
206 continue;
207 }
208
209 const char* bcp47_key = uloc_toUnicodeLocaleKey(keyword);
210
211 // Ignore keywords that we don't recognize - spec allows that.
212 if (bcp47_key && (relevant_keys.find(bcp47_key) != relevant_keys.end())) {
213 const char* bcp47_value = uloc_toUnicodeLocaleType(bcp47_key, value);
214 extensions.insert(
215 std::pair<std::string, std::string>(bcp47_key, bcp47_value));
216 }
217 }
218
219 return extensions;
220 }
221
SetCaseFirstOption(icu::Collator * icu_collator,const char * value)222 void SetCaseFirstOption(icu::Collator* icu_collator, const char* value) {
223 CHECK_NOT_NULL(icu_collator);
224 CHECK_NOT_NULL(value);
225 UErrorCode status = U_ZERO_ERROR;
226 if (strcmp(value, "upper") == 0) {
227 icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_UPPER_FIRST, status);
228 } else if (strcmp(value, "lower") == 0) {
229 icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_LOWER_FIRST, status);
230 } else {
231 icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_OFF, status);
232 }
233 CHECK(U_SUCCESS(status));
234 }
235
236 } // anonymous namespace
237
238 // static
InitializeCollator(Isolate * isolate,Handle<JSCollator> collator,Handle<Object> locales,Handle<Object> options_obj)239 MaybeHandle<JSCollator> JSCollator::InitializeCollator(
240 Isolate* isolate, Handle<JSCollator> collator, Handle<Object> locales,
241 Handle<Object> options_obj) {
242 // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
243 Handle<JSObject> requested_locales;
244 ASSIGN_RETURN_ON_EXCEPTION(isolate, requested_locales,
245 Intl::CanonicalizeLocaleListJS(isolate, locales),
246 JSCollator);
247
248 // 2. If options is undefined, then
249 if (options_obj->IsUndefined(isolate)) {
250 // 2. a. Let options be ObjectCreate(null).
251 options_obj = isolate->factory()->NewJSObjectWithNullProto();
252 } else {
253 // 3. Else
254 // 3. a. Let options be ? ToObject(options).
255 ASSIGN_RETURN_ON_EXCEPTION(
256 isolate, options_obj,
257 Object::ToObject(isolate, options_obj, "Intl.Collator"), JSCollator);
258 }
259
260 // At this point, options_obj can either be a JSObject or a JSProxy only.
261 Handle<JSReceiver> options = Handle<JSReceiver>::cast(options_obj);
262
263 // 4. Let usage be ? GetOption(options, "usage", "string", « "sort",
264 // "search" », "sort").
265 std::vector<const char*> values = {"sort", "search"};
266 std::unique_ptr<char[]> usage_str = nullptr;
267 JSCollator::Usage usage = JSCollator::Usage::SORT;
268 Maybe<bool> found_usage = Intl::GetStringOption(
269 isolate, options, "usage", values, "Intl.Collator", &usage_str);
270 MAYBE_RETURN(found_usage, MaybeHandle<JSCollator>());
271
272 if (found_usage.FromJust()) {
273 DCHECK_NOT_NULL(usage_str.get());
274 if (strcmp(usage_str.get(), "search") == 0) {
275 usage = JSCollator::Usage::SEARCH;
276 }
277 }
278
279 // 5. Set collator.[[Usage]] to usage.
280 collator->set_usage(usage);
281
282 // 6. If usage is "sort", then
283 // a. Let localeData be %Collator%.[[SortLocaleData]].
284 // 7. Else,
285 // a. Let localeData be %Collator%.[[SearchLocaleData]].
286 //
287 // The above two spec operations aren't required, the Intl spec is
288 // crazy. See https://github.com/tc39/ecma402/issues/256
289
290 // TODO(gsathya): This is currently done as part of the
291 // Intl::ResolveLocale call below. Fix this once resolveLocale is
292 // changed to not do the lookup.
293 //
294 // 9. Let matcher be ? GetOption(options, "localeMatcher", "string",
295 // « "lookup", "best fit" », "best fit").
296 // 10. Set opt.[[localeMatcher]] to matcher.
297
298 // 11. Let numeric be ? GetOption(options, "numeric", "boolean",
299 // undefined, undefined).
300 // 12. If numeric is not undefined, then
301 // a. Let numeric be ! ToString(numeric).
302 //
303 // Note: We omit the ToString(numeric) operation as it's not
304 // observable. Intl::GetBoolOption returns a Boolean and
305 // ToString(Boolean) is not side-effecting.
306 //
307 // 13. Set opt.[[kn]] to numeric.
308 bool numeric;
309 Maybe<bool> found_numeric = Intl::GetBoolOption(isolate, options, "numeric",
310 "Intl.Collator", &numeric);
311 MAYBE_RETURN(found_numeric, MaybeHandle<JSCollator>());
312
313 // 14. Let caseFirst be ? GetOption(options, "caseFirst", "string",
314 // « "upper", "lower", "false" », undefined).
315 // 15. Set opt.[[kf]] to caseFirst.
316 values = {"upper", "lower", "false"};
317 std::unique_ptr<char[]> case_first_str = nullptr;
318 Maybe<bool> found_case_first = Intl::GetStringOption(
319 isolate, options, "caseFirst", values, "Intl.Collator", &case_first_str);
320 MAYBE_RETURN(found_case_first, MaybeHandle<JSCollator>());
321
322 // The relevant unicode extensions accepted by Collator as specified here:
323 // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
324 //
325 // 16. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
326 std::set<std::string> relevant_extension_keys{"co", "kn", "kf"};
327
328 // We don't pass the relevant_extension_keys to ResolveLocale here
329 // as per the spec.
330 //
331 // In ResolveLocale, the spec makes sure we only pick and use the
332 // relevant extension keys and ignore any other keys. Also, in
333 // ResolveLocale, the spec makes sure that if a given key has both a
334 // value in the options object and an unicode extension value, then
335 // we pick the value provided in the options object.
336 // For example: in the case of `new Intl.Collator('en-u-kn-true', {
337 // numeric: false })` the value `false` is used for the `numeric`
338 // key.
339 //
340 // Instead of performing all this validation in ResolveLocale, we
341 // just perform it inline below. In the future when we port
342 // ResolveLocale to C++, we can make all these validations generic
343 // and move it ResolveLocale.
344 //
345 // 17. Let r be ResolveLocale(%Collator%.[[AvailableLocales]],
346 // requestedLocales, opt, %Collator%.[[RelevantExtensionKeys]],
347 // localeData).
348 // 18. Set collator.[[Locale]] to r.[[locale]].
349 Handle<JSObject> r;
350 ASSIGN_RETURN_ON_EXCEPTION(
351 isolate, r,
352 Intl::ResolveLocale(isolate, "collator", requested_locales, options),
353 JSCollator);
354
355 Handle<String> locale_with_extension_str =
356 isolate->factory()->NewStringFromStaticChars("localeWithExtension");
357 Handle<Object> locale_with_extension_obj =
358 JSObject::GetDataProperty(r, locale_with_extension_str);
359
360 // The locale_with_extension has to be a string. Either a user
361 // provided canonicalized string or the default locale.
362 CHECK(locale_with_extension_obj->IsString());
363 Handle<String> locale_with_extension =
364 Handle<String>::cast(locale_with_extension_obj);
365
366 icu::Locale icu_locale =
367 Intl::CreateICULocale(isolate, locale_with_extension);
368 DCHECK(!icu_locale.isBogus());
369
370 std::map<std::string, std::string> extensions =
371 LookupUnicodeExtensions(icu_locale, relevant_extension_keys);
372
373 // 19. Let collation be r.[[co]].
374 //
375 // r.[[co]] is already set as part of the icu::Locale creation as
376 // icu parses unicode extensions and sets the keywords.
377 //
378 // We need to sanitize the keywords based on certain ECMAScript rules.
379 //
380 // As per https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots:
381 // The values "standard" and "search" must not be used as elements
382 // in any [[SortLocaleData]][locale].co and
383 // [[SearchLocaleData]][locale].co list.
384 auto co_extension_it = extensions.find("co");
385 if (co_extension_it != extensions.end()) {
386 const std::string& value = co_extension_it->second;
387 if ((value == "search") || (value == "standard")) {
388 UErrorCode status = U_ZERO_ERROR;
389 icu_locale.setKeywordValue("co", NULL, status);
390 CHECK(U_SUCCESS(status));
391 }
392 }
393
394 // 20. If collation is null, let collation be "default".
395 // 21. Set collator.[[Collation]] to collation.
396 //
397 // We don't store the collation value as per the above two steps
398 // here. The collation value can be looked up from icu::Collator on
399 // demand, as part of Intl.Collator.prototype.resolvedOptions.
400
401 UErrorCode status = U_ZERO_ERROR;
402 std::unique_ptr<icu::Collator> icu_collator(
403 icu::Collator::createInstance(icu_locale, status));
404 if (U_FAILURE(status) || icu_collator.get() == nullptr) {
405 status = U_ZERO_ERROR;
406 // Remove extensions and try again.
407 icu::Locale no_extension_locale(icu_locale.getBaseName());
408 icu_collator.reset(
409 icu::Collator::createInstance(no_extension_locale, status));
410
411 if (U_FAILURE(status) || icu_collator.get() == nullptr) {
412 FATAL("Failed to create ICU collator, are ICU data files missing?");
413 }
414 }
415 DCHECK(U_SUCCESS(status));
416 CHECK_NOT_NULL(icu_collator.get());
417
418 // 22. If relevantExtensionKeys contains "kn", then
419 // a. Set collator.[[Numeric]] to ! SameValue(r.[[kn]], "true").
420 //
421 // If the numeric value is passed in through the options object,
422 // then we use it. Otherwise, we check if the numeric value is
423 // passed in through the unicode extensions.
424 status = U_ZERO_ERROR;
425 if (found_numeric.FromJust()) {
426 icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
427 numeric ? UCOL_ON : UCOL_OFF, status);
428 CHECK(U_SUCCESS(status));
429 } else {
430 auto kn_extension_it = extensions.find("kn");
431 if (kn_extension_it != extensions.end()) {
432 const std::string& value = kn_extension_it->second;
433
434 numeric = (value == "true");
435
436 icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
437 numeric ? UCOL_ON : UCOL_OFF, status);
438 CHECK(U_SUCCESS(status));
439 }
440 }
441
442 // 23. If relevantExtensionKeys contains "kf", then
443 // a. Set collator.[[CaseFirst]] to r.[[kf]].
444 //
445 // If the caseFirst value is passed in through the options object,
446 // then we use it. Otherwise, we check if the caseFirst value is
447 // passed in through the unicode extensions.
448 if (found_case_first.FromJust()) {
449 const char* case_first_cstr = case_first_str.get();
450 SetCaseFirstOption(icu_collator.get(), case_first_cstr);
451 } else {
452 auto kf_extension_it = extensions.find("kf");
453 if (kf_extension_it != extensions.end()) {
454 const std::string& value = kf_extension_it->second;
455 SetCaseFirstOption(icu_collator.get(), value.c_str());
456 }
457 }
458
459 // Normalization is always on, by the spec. We are free to optimize
460 // if the strings are already normalized (but we don't have a way to tell
461 // that right now).
462 status = U_ZERO_ERROR;
463 icu_collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
464 CHECK(U_SUCCESS(status));
465
466 // 24. Let sensitivity be ? GetOption(options, "sensitivity",
467 // "string", « "base", "accent", "case", "variant" », undefined).
468 values = {"base", "accent", "case", "variant"};
469 std::unique_ptr<char[]> sensitivity_str = nullptr;
470 Maybe<bool> found_sensitivity =
471 Intl::GetStringOption(isolate, options, "sensitivity", values,
472 "Intl.Collator", &sensitivity_str);
473 MAYBE_RETURN(found_sensitivity, MaybeHandle<JSCollator>());
474
475 // 25. If sensitivity is undefined, then
476 if (!found_sensitivity.FromJust()) {
477 // 25. a. If usage is "sort", then
478 if (usage == Usage::SORT) {
479 // 25. a. i. Let sensitivity be "variant".
480 // 26. Set collator.[[Sensitivity]] to sensitivity.
481 icu_collator->setStrength(icu::Collator::TERTIARY);
482 }
483 } else {
484 DCHECK(found_sensitivity.FromJust());
485 const char* sensitivity_cstr = sensitivity_str.get();
486 DCHECK_NOT_NULL(sensitivity_cstr);
487
488 // 26. Set collator.[[Sensitivity]] to sensitivity.
489 if (strcmp(sensitivity_cstr, "base") == 0) {
490 icu_collator->setStrength(icu::Collator::PRIMARY);
491 } else if (strcmp(sensitivity_cstr, "accent") == 0) {
492 icu_collator->setStrength(icu::Collator::SECONDARY);
493 } else if (strcmp(sensitivity_cstr, "case") == 0) {
494 icu_collator->setStrength(icu::Collator::PRIMARY);
495 status = U_ZERO_ERROR;
496 icu_collator->setAttribute(UCOL_CASE_LEVEL, UCOL_ON, status);
497 CHECK(U_SUCCESS(status));
498 } else {
499 DCHECK_EQ(0, strcmp(sensitivity_cstr, "variant"));
500 icu_collator->setStrength(icu::Collator::TERTIARY);
501 }
502 }
503
504 // 27.Let ignorePunctuation be ? GetOption(options,
505 // "ignorePunctuation", "boolean", undefined, false).
506 bool ignore_punctuation;
507 Maybe<bool> found_ignore_punctuation =
508 Intl::GetBoolOption(isolate, options, "ignorePunctuation",
509 "Intl.Collator", &ignore_punctuation);
510 MAYBE_RETURN(found_ignore_punctuation, MaybeHandle<JSCollator>());
511
512 // 28. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
513 if (found_ignore_punctuation.FromJust() && ignore_punctuation) {
514 status = U_ZERO_ERROR;
515 icu_collator->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
516 CHECK(U_SUCCESS(status));
517 }
518
519 Handle<Managed<icu::Collator>> managed_collator =
520 Managed<icu::Collator>::FromUniquePtr(isolate, 0,
521 std::move(icu_collator));
522 collator->set_icu_collator(*managed_collator);
523
524 // 29. Return collator.
525 return collator;
526 }
527
528 // static
UsageToString(Usage usage)529 const char* JSCollator::UsageToString(Usage usage) {
530 switch (usage) {
531 case Usage::SORT:
532 return "sort";
533 case Usage::SEARCH:
534 return "search";
535 case Usage::COUNT:
536 UNREACHABLE();
537 }
538 }
539
540 } // namespace internal
541 } // namespace v8
542