1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22 /*
23 * notes: by srl295
24 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25 * ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26 * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27 * macro names. That's the "english+root" data.
28 *
29 * If icu_data_path is non-null, the user has provided a path and we assume
30 * it goes somewhere useful. We set that path in ICU, and exit.
31 * If icu_data_path is null, they haven't set a path and we want the
32 * "english+root" data. We call
33 * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34 * to load up the english+root data.
35 *
36 * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37 * data. All of the variables and command line options for changing data at
38 * runtime are disabled, as they wouldn't fully override the internal data.
39 * See: http://bugs.icu-project.org/trac/ticket/10924
40 */
41
42
43 #include "node_i18n.h"
44
45 #if defined(NODE_HAVE_I18N_SUPPORT)
46
47 #include "base_object-inl.h"
48 #include "node.h"
49 #include "node_buffer.h"
50 #include "node_errors.h"
51 #include "node_internals.h"
52 #include "util-inl.h"
53 #include "v8.h"
54
55 #include <unicode/utypes.h>
56 #include <unicode/putil.h>
57 #include <unicode/uchar.h>
58 #include <unicode/uclean.h>
59 #include <unicode/udata.h>
60 #include <unicode/uidna.h>
61 #include <unicode/ucnv.h>
62 #include <unicode/utf8.h>
63 #include <unicode/utf16.h>
64 #include <unicode/timezone.h>
65 #include <unicode/ulocdata.h>
66 #include <unicode/uvernum.h>
67 #include <unicode/uversion.h>
68 #include <unicode/ustring.h>
69
#ifdef NODE_HAVE_SMALL_ICU
// In this configuration there is a 'secondary' data entry point. Its symbol
// name is pasted together below as icusmdt[<suffix>]<major>_dat; compare with
// the definitions of U_ICUDATA_ENTRY_POINT in utypes.h.
#ifdef U_LIB_SUFFIX_C_NAME
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
#else
#define SMALL_DEF(major, suff) icusmdt##major##_dat
#endif
// Extra level of indirection so that the arguments are macro-expanded before
// SMALL_DEF pastes them into an identifier.
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
#define SMALL_ICUDATA_ENTRY_POINT \
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)

extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
#endif
84
85 namespace node {
86
87 using v8::Context;
88 using v8::FunctionCallbackInfo;
89 using v8::FunctionTemplate;
90 using v8::Int32;
91 using v8::Isolate;
92 using v8::Local;
93 using v8::MaybeLocal;
94 using v8::NewStringType;
95 using v8::Object;
96 using v8::ObjectTemplate;
97 using v8::String;
98 using v8::Uint8Array;
99 using v8::Value;
100
101 namespace i18n {
102 namespace {
103
104 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)105 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
106 MaybeLocal<Object> ret = Buffer::New(env, buf);
107 if (ret.IsEmpty())
108 return ret;
109
110 static_assert(sizeof(T) == 1 || sizeof(T) == 2,
111 "Currently only one- or two-byte buffers are supported");
112 if (sizeof(T) > 1 && IsBigEndian()) {
113 SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
114 SwapBytes16(retbuf_data, retbuf_length);
115 }
116
117 return ret;
118 }
119
120 // One-Shot Converters
121
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)122 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
123 const char* data,
124 const size_t length,
125 const size_t length_in_chars) {
126 dest->AllocateSufficientStorage(length_in_chars);
127 char* dst = reinterpret_cast<char*>(**dest);
128 memcpy(dst, data, length);
129 if (IsBigEndian()) {
130 SwapBytes16(dst, length);
131 }
132 }
133
134 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
135 const char* fromEncoding,
136 const char* toEncoding,
137 const char* source,
138 const size_t source_length,
139 UErrorCode* status);
140
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)141 MaybeLocal<Object> Transcode(Environment* env,
142 const char* fromEncoding,
143 const char* toEncoding,
144 const char* source,
145 const size_t source_length,
146 UErrorCode* status) {
147 *status = U_ZERO_ERROR;
148 MaybeLocal<Object> ret;
149 MaybeStackBuffer<char> result;
150 Converter to(toEncoding, "?");
151 Converter from(fromEncoding);
152 const uint32_t limit = source_length * to.max_char_size();
153 result.AllocateSufficientStorage(limit);
154 char* target = *result;
155 ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
156 &source, source + source_length, nullptr, nullptr,
157 nullptr, nullptr, true, true, status);
158 if (U_SUCCESS(*status)) {
159 result.SetLength(target - &result[0]);
160 ret = ToBufferEndian(env, &result);
161 }
162 return ret;
163 }
164
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)165 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
166 const char* fromEncoding,
167 const char* toEncoding,
168 const char* source,
169 const size_t source_length,
170 UErrorCode* status) {
171 *status = U_ZERO_ERROR;
172 MaybeLocal<Object> ret;
173 MaybeStackBuffer<UChar> destbuf(source_length);
174 Converter from(fromEncoding);
175 const size_t length_in_chars = source_length * sizeof(UChar);
176 ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
177 source, source_length, status);
178 if (U_SUCCESS(*status))
179 ret = ToBufferEndian(env, &destbuf);
180 return ret;
181 }
182
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)183 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
184 const char* fromEncoding,
185 const char* toEncoding,
186 const char* source,
187 const size_t source_length,
188 UErrorCode* status) {
189 *status = U_ZERO_ERROR;
190 MaybeStackBuffer<UChar> sourcebuf;
191 MaybeLocal<Object> ret;
192 Converter to(toEncoding, "?");
193 const size_t length_in_chars = source_length / sizeof(UChar);
194 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
195 MaybeStackBuffer<char> destbuf(length_in_chars);
196 const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
197 *sourcebuf, length_in_chars, status);
198 if (U_SUCCESS(*status)) {
199 destbuf.SetLength(len);
200 ret = ToBufferEndian(env, &destbuf);
201 }
202 return ret;
203 }
204
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)205 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
206 const char* fromEncoding,
207 const char* toEncoding,
208 const char* source,
209 const size_t source_length,
210 UErrorCode* status) {
211 *status = U_ZERO_ERROR;
212 MaybeStackBuffer<UChar> destbuf;
213 int32_t result_length;
214 u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
215 source, source_length, status);
216 MaybeLocal<Object> ret;
217 if (U_SUCCESS(*status)) {
218 destbuf.SetLength(result_length);
219 ret = ToBufferEndian(env, &destbuf);
220 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
221 *status = U_ZERO_ERROR;
222 destbuf.AllocateSufficientStorage(result_length);
223 u_strFromUTF8(*destbuf, result_length, &result_length,
224 source, source_length, status);
225 if (U_SUCCESS(*status)) {
226 destbuf.SetLength(result_length);
227 ret = ToBufferEndian(env, &destbuf);
228 }
229 }
230 return ret;
231 }
232
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)233 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
234 const char* fromEncoding,
235 const char* toEncoding,
236 const char* source,
237 const size_t source_length,
238 UErrorCode* status) {
239 *status = U_ZERO_ERROR;
240 MaybeLocal<Object> ret;
241 const size_t length_in_chars = source_length / sizeof(UChar);
242 int32_t result_length;
243 MaybeStackBuffer<UChar> sourcebuf;
244 MaybeStackBuffer<char> destbuf;
245 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
246 u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
247 *sourcebuf, length_in_chars, status);
248 if (U_SUCCESS(*status)) {
249 destbuf.SetLength(result_length);
250 ret = ToBufferEndian(env, &destbuf);
251 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
252 *status = U_ZERO_ERROR;
253 destbuf.AllocateSufficientStorage(result_length);
254 u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
255 length_in_chars, status);
256 if (U_SUCCESS(*status)) {
257 destbuf.SetLength(result_length);
258 ret = ToBufferEndian(env, &destbuf);
259 }
260 }
261 return ret;
262 }
263
EncodingName(const enum encoding encoding)264 const char* EncodingName(const enum encoding encoding) {
265 switch (encoding) {
266 case ASCII: return "us-ascii";
267 case LATIN1: return "iso8859-1";
268 case UCS2: return "utf16le";
269 case UTF8: return "utf-8";
270 default: return nullptr;
271 }
272 }
273
SupportedEncoding(const enum encoding encoding)274 bool SupportedEncoding(const enum encoding encoding) {
275 switch (encoding) {
276 case ASCII:
277 case LATIN1:
278 case UCS2:
279 case UTF8: return true;
280 default: return false;
281 }
282 }
283
Transcode(const FunctionCallbackInfo<Value> & args)284 void Transcode(const FunctionCallbackInfo<Value>&args) {
285 Environment* env = Environment::GetCurrent(args);
286 Isolate* isolate = env->isolate();
287 UErrorCode status = U_ZERO_ERROR;
288 MaybeLocal<Object> result;
289
290 ArrayBufferViewContents<char> input(args[0]);
291 const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
292 const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
293
294 if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
295 TranscodeFunc tfn = &Transcode;
296 switch (fromEncoding) {
297 case ASCII:
298 case LATIN1:
299 if (toEncoding == UCS2)
300 tfn = &TranscodeToUcs2;
301 break;
302 case UTF8:
303 if (toEncoding == UCS2)
304 tfn = &TranscodeUcs2FromUtf8;
305 break;
306 case UCS2:
307 switch (toEncoding) {
308 case UCS2:
309 tfn = &Transcode;
310 break;
311 case UTF8:
312 tfn = &TranscodeUtf8FromUcs2;
313 break;
314 default:
315 tfn = &TranscodeFromUcs2;
316 }
317 break;
318 default:
319 // This should not happen because of the SupportedEncoding checks
320 ABORT();
321 }
322
323 result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
324 input.data(), input.length(), &status);
325 } else {
326 status = U_ILLEGAL_ARGUMENT_ERROR;
327 }
328
329 if (result.IsEmpty())
330 return args.GetReturnValue().Set(status);
331
332 return args.GetReturnValue().Set(result.ToLocalChecked());
333 }
334
ICUErrorName(const FunctionCallbackInfo<Value> & args)335 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
336 Environment* env = Environment::GetCurrent(args);
337 CHECK(args[0]->IsInt32());
338 UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
339 args.GetReturnValue().Set(
340 String::NewFromUtf8(env->isolate(),
341 u_errorName(status),
342 NewStringType::kNormal).ToLocalChecked());
343 }
344
345 } // anonymous namespace
346
Converter(const char * name,const char * sub)347 Converter::Converter(const char* name, const char* sub) {
348 UErrorCode status = U_ZERO_ERROR;
349 UConverter* conv = ucnv_open(name, &status);
350 CHECK(U_SUCCESS(status));
351 conv_.reset(conv);
352 set_subst_chars(sub);
353 }
354
Converter(UConverter * converter,const char * sub)355 Converter::Converter(UConverter* converter, const char* sub)
356 : conv_(converter) {
357 set_subst_chars(sub);
358 }
359
set_subst_chars(const char * sub)360 void Converter::set_subst_chars(const char* sub) {
361 CHECK(conv_);
362 UErrorCode status = U_ZERO_ERROR;
363 if (sub != nullptr) {
364 ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
365 CHECK(U_SUCCESS(status));
366 }
367 }
368
reset()369 void Converter::reset() {
370 ucnv_reset(conv_.get());
371 }
372
min_char_size() const373 size_t Converter::min_char_size() const {
374 CHECK(conv_);
375 return ucnv_getMinCharSize(conv_.get());
376 }
377
max_char_size() const378 size_t Converter::max_char_size() const {
379 CHECK(conv_);
380 return ucnv_getMaxCharSize(conv_.get());
381 }
382
Has(const FunctionCallbackInfo<Value> & args)383 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
384 Environment* env = Environment::GetCurrent(args);
385
386 CHECK_GE(args.Length(), 1);
387 Utf8Value label(env->isolate(), args[0]);
388
389 UErrorCode status = U_ZERO_ERROR;
390 ConverterPointer conv(ucnv_open(*label, &status));
391 args.GetReturnValue().Set(!!U_SUCCESS(status));
392 }
393
Create(const FunctionCallbackInfo<Value> & args)394 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
395 Environment* env = Environment::GetCurrent(args);
396
397 Local<ObjectTemplate> t = env->i18n_converter_template();
398 Local<Object> obj;
399 if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
400
401 CHECK_GE(args.Length(), 2);
402 Utf8Value label(env->isolate(), args[0]);
403 int flags = args[1]->Uint32Value(env->context()).ToChecked();
404 bool fatal =
405 (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
406
407 UErrorCode status = U_ZERO_ERROR;
408 UConverter* conv = ucnv_open(*label, &status);
409 if (U_FAILURE(status))
410 return;
411
412 if (fatal) {
413 status = U_ZERO_ERROR;
414 ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
415 nullptr, nullptr, nullptr, &status);
416 }
417
418 new ConverterObject(env, obj, conv, flags);
419 args.GetReturnValue().Set(obj);
420 }
421
Decode(const FunctionCallbackInfo<Value> & args)422 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
423 Environment* env = Environment::GetCurrent(args);
424
425 CHECK_GE(args.Length(), 3); // Converter, Buffer, Flags
426
427 ConverterObject* converter;
428 ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
429 ArrayBufferViewContents<char> input(args[1]);
430 int flags = args[2]->Uint32Value(env->context()).ToChecked();
431
432 UErrorCode status = U_ZERO_ERROR;
433 MaybeStackBuffer<UChar> result;
434 MaybeLocal<Object> ret;
435 size_t limit = converter->min_char_size() * input.length();
436 if (limit > 0)
437 result.AllocateSufficientStorage(limit);
438
439 UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
440 auto cleanup = OnScopeLeave([&]() {
441 if (flush) {
442 // Reset the converter state.
443 converter->set_bom_seen(false);
444 converter->reset();
445 }
446 });
447
448 const char* source = input.data();
449 size_t source_length = input.length();
450
451 UChar* target = *result;
452 ucnv_toUnicode(converter->conv(),
453 &target,
454 target + (limit * sizeof(UChar)),
455 &source,
456 source + source_length,
457 nullptr,
458 flush,
459 &status);
460
461 if (U_SUCCESS(status)) {
462 bool omit_initial_bom = false;
463 if (limit > 0) {
464 result.SetLength(target - &result[0]);
465 if (result.length() > 0 &&
466 converter->unicode() &&
467 !converter->ignore_bom() &&
468 !converter->bom_seen()) {
469 // If the very first result in the stream is a BOM, and we are not
470 // explicitly told to ignore it, then we mark it for discarding.
471 if (result[0] == 0xFEFF)
472 omit_initial_bom = true;
473 converter->set_bom_seen(true);
474 }
475 }
476 ret = ToBufferEndian(env, &result);
477 if (omit_initial_bom && !ret.IsEmpty()) {
478 // Peform `ret = ret.slice(2)`.
479 CHECK(ret.ToLocalChecked()->IsUint8Array());
480 Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
481 ret = Buffer::New(env,
482 orig_ret->Buffer(),
483 orig_ret->ByteOffset() + 2,
484 orig_ret->ByteLength() - 2)
485 .FromMaybe(Local<Uint8Array>());
486 }
487 if (!ret.IsEmpty())
488 args.GetReturnValue().Set(ret.ToLocalChecked());
489 return;
490 }
491
492 args.GetReturnValue().Set(status);
493 }
494
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)495 ConverterObject::ConverterObject(
496 Environment* env,
497 Local<Object> wrap,
498 UConverter* converter,
499 int flags,
500 const char* sub)
501 : BaseObject(env, wrap),
502 Converter(converter, sub),
503 flags_(flags) {
504 MakeWeak();
505
506 switch (ucnv_getType(converter)) {
507 case UCNV_UTF8:
508 case UCNV_UTF16_BigEndian:
509 case UCNV_UTF16_LittleEndian:
510 flags_ |= CONVERTER_FLAGS_UNICODE;
511 break;
512 default: {
513 // Fall through
514 }
515 }
516 }
517
518
InitializeICUDirectory(const std::string & path)519 bool InitializeICUDirectory(const std::string& path) {
520 UErrorCode status = U_ZERO_ERROR;
521 if (path.empty()) {
522 #ifdef NODE_HAVE_SMALL_ICU
523 // install the 'small' data.
524 udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
525 #else // !NODE_HAVE_SMALL_ICU
526 // no small data, so nothing to do.
527 #endif // !NODE_HAVE_SMALL_ICU
528 } else {
529 u_setDataDirectory(path.c_str());
530 u_init(&status);
531 }
532 return status == U_ZERO_ERROR;
533 }
534
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)535 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
536 const char* input,
537 size_t length) {
538 UErrorCode status = U_ZERO_ERROR;
539 uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
540 UIDNA* uidna = uidna_openUTS46(options, &status);
541 if (U_FAILURE(status))
542 return -1;
543 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
544
545 int32_t len = uidna_nameToUnicodeUTF8(uidna,
546 input, length,
547 **buf, buf->capacity(),
548 &info,
549 &status);
550
551 // Do not check info.errors like we do with ToASCII since ToUnicode always
552 // returns a string, despite any possible errors that may have occurred.
553
554 if (status == U_BUFFER_OVERFLOW_ERROR) {
555 status = U_ZERO_ERROR;
556 buf->AllocateSufficientStorage(len);
557 len = uidna_nameToUnicodeUTF8(uidna,
558 input, length,
559 **buf, buf->capacity(),
560 &info,
561 &status);
562 }
563
564 // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
565 // string, regardless of whether an error occurred.
566
567 if (U_FAILURE(status)) {
568 len = -1;
569 buf->SetLength(0);
570 } else {
571 buf->SetLength(len);
572 }
573
574 uidna_close(uidna);
575 return len;
576 }
577
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,enum idna_mode mode)578 int32_t ToASCII(MaybeStackBuffer<char>* buf,
579 const char* input,
580 size_t length,
581 enum idna_mode mode) {
582 UErrorCode status = U_ZERO_ERROR;
583 uint32_t options = // CheckHyphens = false; handled later
584 UIDNA_CHECK_BIDI | // CheckBidi = true
585 UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true
586 UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing
587 if (mode == IDNA_STRICT) {
588 options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict
589 // VerifyDnsLength = beStrict;
590 // handled later
591 }
592
593 UIDNA* uidna = uidna_openUTS46(options, &status);
594 if (U_FAILURE(status))
595 return -1;
596 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
597
598 int32_t len = uidna_nameToASCII_UTF8(uidna,
599 input, length,
600 **buf, buf->capacity(),
601 &info,
602 &status);
603
604 if (status == U_BUFFER_OVERFLOW_ERROR) {
605 status = U_ZERO_ERROR;
606 buf->AllocateSufficientStorage(len);
607 len = uidna_nameToASCII_UTF8(uidna,
608 input, length,
609 **buf, buf->capacity(),
610 &info,
611 &status);
612 }
613
614 // In UTS #46 which specifies ToASCII, certain error conditions are
615 // configurable through options, and the WHATWG URL Standard promptly elects
616 // to disable some of them to accommodate for real-world use cases.
617 // Unfortunately, ICU4C's IDNA module does not support disabling some of
618 // these options through `options` above, and thus continues throwing
619 // unnecessary errors. To counter this situation, we just filter out the
620 // errors that may have happened afterwards, before deciding whether to
621 // return an error from this function.
622
623 // CheckHyphens = false
624 // (Specified in the current UTS #46 draft rev. 18.)
625 // Refs:
626 // - https://github.com/whatwg/url/issues/53
627 // - https://github.com/whatwg/url/pull/309
628 // - http://www.unicode.org/review/pri317/
629 // - http://www.unicode.org/reports/tr46/tr46-18.html
630 // - https://www.icann.org/news/announcement-2000-01-07-en
631 info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
632 info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
633 info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
634
635 if (mode != IDNA_STRICT) {
636 // VerifyDnsLength = beStrict
637 info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
638 info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
639 info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
640 }
641
642 if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) {
643 len = -1;
644 buf->SetLength(0);
645 } else {
646 buf->SetLength(len);
647 }
648
649 uidna_close(uidna);
650 return len;
651 }
652
ToUnicode(const FunctionCallbackInfo<Value> & args)653 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
654 Environment* env = Environment::GetCurrent(args);
655 CHECK_GE(args.Length(), 1);
656 CHECK(args[0]->IsString());
657 Utf8Value val(env->isolate(), args[0]);
658
659 MaybeStackBuffer<char> buf;
660 int32_t len = ToUnicode(&buf, *val, val.length());
661
662 if (len < 0) {
663 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
664 }
665
666 args.GetReturnValue().Set(
667 String::NewFromUtf8(env->isolate(),
668 *buf,
669 NewStringType::kNormal,
670 len).ToLocalChecked());
671 }
672
ToASCII(const FunctionCallbackInfo<Value> & args)673 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
674 Environment* env = Environment::GetCurrent(args);
675 CHECK_GE(args.Length(), 1);
676 CHECK(args[0]->IsString());
677 Utf8Value val(env->isolate(), args[0]);
678 // optional arg
679 bool lenient = args[1]->BooleanValue(env->isolate());
680 enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT;
681
682 MaybeStackBuffer<char> buf;
683 int32_t len = ToASCII(&buf, *val, val.length(), mode);
684
685 if (len < 0) {
686 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
687 }
688
689 args.GetReturnValue().Set(
690 String::NewFromUtf8(env->isolate(),
691 *buf,
692 NewStringType::kNormal,
693 len).ToLocalChecked());
694 }
695
// This is similar to wcwidth except that it takes the current Unicode
// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emoji and
// newer wide characters. wcwidth, on the other hand, uses a fixed
// algorithm that does not take things like emoji into proper
// consideration.
702 //
703 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
704 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
705 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
706 // allow it to be input. Linux's PTY terminal prints control characters as
707 // Narrow rhombi.
708 //
709 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
710 // consonants are 0-width when combined with initial consonants; otherwise they
711 // are technically Wide. But many terminals (including Konsole and
712 // VTE/GLib-based) implement all medials and finals as 0-width.
713 //
714 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
715 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
716 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)717 static int GetColumnWidth(UChar32 codepoint,
718 bool ambiguous_as_full_width = false) {
719 // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
720 // codepoint as being full width, wide, ambiguous, neutral, narrow,
721 // or halfwidth.
722 const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
723 switch (eaw) {
724 case U_EA_FULLWIDTH:
725 case U_EA_WIDE:
726 return 2;
727 case U_EA_AMBIGUOUS:
728 // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
729 if (ambiguous_as_full_width) {
730 return 2;
731 }
732 // If ambiguous_as_full_width is false:
733 // Fall through
734 case U_EA_NEUTRAL:
735 if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
736 return 2;
737 }
738 // Fall through
739 case U_EA_HALFWIDTH:
740 case U_EA_NARROW:
741 default:
742 const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code
743 U_GC_CF_MASK | // Format control character
744 U_GC_ME_MASK | // Enclosing mark
745 U_GC_MN_MASK; // Nonspacing mark
746 if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width
747 ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
748 u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
749 return 0;
750 }
751 return 1;
752 }
753 }
754
755 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)756 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
757 Environment* env = Environment::GetCurrent(args);
758 CHECK(args[0]->IsString());
759
760 bool ambiguous_as_full_width = args[1]->IsTrue();
761 bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
762
763 TwoByteValue value(env->isolate(), args[0]);
764 // reinterpret_cast is required by windows to compile
765 UChar* str = reinterpret_cast<UChar*>(*value);
766 static_assert(sizeof(*str) == sizeof(**value),
767 "sizeof(*str) == sizeof(**value)");
768 UChar32 c = 0;
769 UChar32 p;
770 size_t n = 0;
771 uint32_t width = 0;
772
773 while (n < value.length()) {
774 p = c;
775 U16_NEXT(str, n, value.length(), c);
776 // Don't count individual emoji codepoints that occur within an
777 // emoji sequence. This is not necessarily foolproof. Some
778 // environments display emoji sequences in the appropriate
779 // condensed form (as a single emoji glyph), other environments
780 // may not understand an emoji sequence and will display each
781 // individual emoji separately. When this happens, the width
782 // calculated will be off, and there's no reliable way of knowing
783 // in advance if a particular sequence is going to be supported.
784 // The expand_emoji_sequence option allows the caller to skip this
785 // check and count each code within an emoji sequence separately.
786 // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
787 if (!expand_emoji_sequence &&
788 n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner)
789 (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
790 u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
791 continue;
792 }
793 width += GetColumnWidth(c, ambiguous_as_full_width);
794 }
795 args.GetReturnValue().Set(width);
796 }
797
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)798 void Initialize(Local<Object> target,
799 Local<Value> unused,
800 Local<Context> context,
801 void* priv) {
802 Environment* env = Environment::GetCurrent(context);
803 env->SetMethod(target, "toUnicode", ToUnicode);
804 env->SetMethod(target, "toASCII", ToASCII);
805 env->SetMethod(target, "getStringWidth", GetStringWidth);
806
807 // One-shot converters
808 env->SetMethod(target, "icuErrName", ICUErrorName);
809 env->SetMethod(target, "transcode", Transcode);
810
811 // ConverterObject
812 {
813 Local<FunctionTemplate> t = FunctionTemplate::New(env->isolate());
814 t->Inherit(BaseObject::GetConstructorTemplate(env));
815 t->InstanceTemplate()->SetInternalFieldCount(
816 ConverterObject::kInternalFieldCount);
817 Local<String> converter_string =
818 FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
819 t->SetClassName(converter_string);
820 env->set_i18n_converter_template(t->InstanceTemplate());
821 }
822
823 env->SetMethod(target, "getConverter", ConverterObject::Create);
824 env->SetMethod(target, "decode", ConverterObject::Decode);
825 env->SetMethod(target, "hasConverter", ConverterObject::Has);
826 }
827
828 } // namespace i18n
829 } // namespace node
830
831 NODE_MODULE_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
832
833 #endif // NODE_HAVE_I18N_SUPPORT
834