1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22 /*
23 * notes: by srl295
24 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25 * ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26 * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27 * macro names. That's the "english+root" data.
28 *
29 * If icu_data_path is non-null, the user has provided a path and we assume
30 * it goes somewhere useful. We set that path in ICU, and exit.
31 * If icu_data_path is null, they haven't set a path and we want the
32 * "english+root" data. We call
33 * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34 * to load up the english+root data.
35 *
36 * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37 * data. All of the variables and command line options for changing data at
38 * runtime are disabled, as they wouldn't fully override the internal data.
39 * See: http://bugs.icu-project.org/trac/ticket/10924
40 */
41
42
43 #include "node_i18n.h"
44
45 #if defined(NODE_HAVE_I18N_SUPPORT)
46
47 #include "base_object-inl.h"
48 #include "node.h"
49 #include "node_buffer.h"
50 #include "node_errors.h"
51 #include "node_internals.h"
52 #include "util-inl.h"
53 #include "v8.h"
54
55 #include <unicode/utypes.h>
56 #include <unicode/putil.h>
57 #include <unicode/uchar.h>
58 #include <unicode/uclean.h>
59 #include <unicode/udata.h>
60 #include <unicode/uidna.h>
61 #include <unicode/ucnv.h>
62 #include <unicode/utf8.h>
63 #include <unicode/utf16.h>
64 #include <unicode/timezone.h>
65 #include <unicode/ulocdata.h>
66 #include <unicode/uvernum.h>
67 #include <unicode/uversion.h>
68 #include <unicode/ustring.h>
69
70 #ifdef NODE_HAVE_SMALL_ICU
71 /* if this is defined, we have a 'secondary' entry point.
72 compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
73 #define SMALL_ICUDATA_ENTRY_POINT \
74 SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)
75 #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
76 #ifndef U_LIB_SUFFIX_C_NAME
77 #define SMALL_DEF(major, suff) icusmdt##major##_dat
78 #else
79 #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
80 #endif
81
82 extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
83 #endif
84
85 namespace node {
86
87 using v8::Context;
88 using v8::FunctionCallbackInfo;
89 using v8::FunctionTemplate;
90 using v8::Int32;
91 using v8::Isolate;
92 using v8::Local;
93 using v8::MaybeLocal;
94 using v8::NewStringType;
95 using v8::Object;
96 using v8::ObjectTemplate;
97 using v8::String;
98 using v8::Uint8Array;
99 using v8::Value;
100
101 namespace i18n {
102 namespace {
103
104 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)105 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
106 MaybeLocal<Object> ret = Buffer::New(env, buf);
107 if (ret.IsEmpty())
108 return ret;
109
110 static_assert(sizeof(T) == 1 || sizeof(T) == 2,
111 "Currently only one- or two-byte buffers are supported");
112 if (sizeof(T) > 1 && IsBigEndian()) {
113 SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
114 SwapBytes16(retbuf_data, retbuf_length);
115 }
116
117 return ret;
118 }
119
120 // One-Shot Converters
121
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)122 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
123 const char* data,
124 const size_t length,
125 const size_t length_in_chars) {
126 dest->AllocateSufficientStorage(length_in_chars);
127 char* dst = reinterpret_cast<char*>(**dest);
128 memcpy(dst, data, length);
129 if (IsBigEndian()) {
130 SwapBytes16(dst, length);
131 }
132 }
133
134 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
135 const char* fromEncoding,
136 const char* toEncoding,
137 const char* source,
138 const size_t source_length,
139 UErrorCode* status);
140
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)141 MaybeLocal<Object> Transcode(Environment* env,
142 const char* fromEncoding,
143 const char* toEncoding,
144 const char* source,
145 const size_t source_length,
146 UErrorCode* status) {
147 *status = U_ZERO_ERROR;
148 MaybeLocal<Object> ret;
149 MaybeStackBuffer<char> result;
150 Converter to(toEncoding);
151 Converter from(fromEncoding);
152
153 size_t sublen = ucnv_getMinCharSize(to.conv());
154 std::string sub(sublen, '?');
155 to.set_subst_chars(sub.c_str());
156
157 const uint32_t limit = source_length * to.max_char_size();
158 result.AllocateSufficientStorage(limit);
159 char* target = *result;
160 ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
161 &source, source + source_length, nullptr, nullptr,
162 nullptr, nullptr, true, true, status);
163 if (U_SUCCESS(*status)) {
164 result.SetLength(target - &result[0]);
165 ret = ToBufferEndian(env, &result);
166 }
167 return ret;
168 }
169
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)170 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
171 const char* fromEncoding,
172 const char* toEncoding,
173 const char* source,
174 const size_t source_length,
175 UErrorCode* status) {
176 *status = U_ZERO_ERROR;
177 MaybeLocal<Object> ret;
178 MaybeStackBuffer<UChar> destbuf(source_length);
179 Converter from(fromEncoding);
180 const size_t length_in_chars = source_length * sizeof(UChar);
181 ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
182 source, source_length, status);
183 if (U_SUCCESS(*status))
184 ret = ToBufferEndian(env, &destbuf);
185 return ret;
186 }
187
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)188 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
189 const char* fromEncoding,
190 const char* toEncoding,
191 const char* source,
192 const size_t source_length,
193 UErrorCode* status) {
194 *status = U_ZERO_ERROR;
195 MaybeStackBuffer<UChar> sourcebuf;
196 MaybeLocal<Object> ret;
197 Converter to(toEncoding);
198
199 size_t sublen = ucnv_getMinCharSize(to.conv());
200 std::string sub(sublen, '?');
201 to.set_subst_chars(sub.c_str());
202
203 const size_t length_in_chars = source_length / sizeof(UChar);
204 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
205 MaybeStackBuffer<char> destbuf(length_in_chars);
206 const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
207 *sourcebuf, length_in_chars, status);
208 if (U_SUCCESS(*status)) {
209 destbuf.SetLength(len);
210 ret = ToBufferEndian(env, &destbuf);
211 }
212 return ret;
213 }
214
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)215 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
216 const char* fromEncoding,
217 const char* toEncoding,
218 const char* source,
219 const size_t source_length,
220 UErrorCode* status) {
221 *status = U_ZERO_ERROR;
222 MaybeStackBuffer<UChar> destbuf;
223 int32_t result_length;
224 u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
225 source, source_length, status);
226 MaybeLocal<Object> ret;
227 if (U_SUCCESS(*status)) {
228 destbuf.SetLength(result_length);
229 ret = ToBufferEndian(env, &destbuf);
230 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
231 *status = U_ZERO_ERROR;
232 destbuf.AllocateSufficientStorage(result_length);
233 u_strFromUTF8(*destbuf, result_length, &result_length,
234 source, source_length, status);
235 if (U_SUCCESS(*status)) {
236 destbuf.SetLength(result_length);
237 ret = ToBufferEndian(env, &destbuf);
238 }
239 }
240 return ret;
241 }
242
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)243 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
244 const char* fromEncoding,
245 const char* toEncoding,
246 const char* source,
247 const size_t source_length,
248 UErrorCode* status) {
249 *status = U_ZERO_ERROR;
250 MaybeLocal<Object> ret;
251 const size_t length_in_chars = source_length / sizeof(UChar);
252 int32_t result_length;
253 MaybeStackBuffer<UChar> sourcebuf;
254 MaybeStackBuffer<char> destbuf;
255 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
256 u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
257 *sourcebuf, length_in_chars, status);
258 if (U_SUCCESS(*status)) {
259 destbuf.SetLength(result_length);
260 ret = ToBufferEndian(env, &destbuf);
261 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
262 *status = U_ZERO_ERROR;
263 destbuf.AllocateSufficientStorage(result_length);
264 u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
265 length_in_chars, status);
266 if (U_SUCCESS(*status)) {
267 destbuf.SetLength(result_length);
268 ret = ToBufferEndian(env, &destbuf);
269 }
270 }
271 return ret;
272 }
273
EncodingName(const enum encoding encoding)274 const char* EncodingName(const enum encoding encoding) {
275 switch (encoding) {
276 case ASCII: return "us-ascii";
277 case LATIN1: return "iso8859-1";
278 case UCS2: return "utf16le";
279 case UTF8: return "utf-8";
280 default: return nullptr;
281 }
282 }
283
SupportedEncoding(const enum encoding encoding)284 bool SupportedEncoding(const enum encoding encoding) {
285 switch (encoding) {
286 case ASCII:
287 case LATIN1:
288 case UCS2:
289 case UTF8: return true;
290 default: return false;
291 }
292 }
293
Transcode(const FunctionCallbackInfo<Value> & args)294 void Transcode(const FunctionCallbackInfo<Value>&args) {
295 Environment* env = Environment::GetCurrent(args);
296 Isolate* isolate = env->isolate();
297 UErrorCode status = U_ZERO_ERROR;
298 MaybeLocal<Object> result;
299
300 ArrayBufferViewContents<char> input(args[0]);
301 const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
302 const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
303
304 if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
305 TranscodeFunc tfn = &Transcode;
306 switch (fromEncoding) {
307 case ASCII:
308 case LATIN1:
309 if (toEncoding == UCS2)
310 tfn = &TranscodeToUcs2;
311 break;
312 case UTF8:
313 if (toEncoding == UCS2)
314 tfn = &TranscodeUcs2FromUtf8;
315 break;
316 case UCS2:
317 switch (toEncoding) {
318 case UCS2:
319 tfn = &Transcode;
320 break;
321 case UTF8:
322 tfn = &TranscodeUtf8FromUcs2;
323 break;
324 default:
325 tfn = &TranscodeFromUcs2;
326 }
327 break;
328 default:
329 // This should not happen because of the SupportedEncoding checks
330 ABORT();
331 }
332
333 result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
334 input.data(), input.length(), &status);
335 } else {
336 status = U_ILLEGAL_ARGUMENT_ERROR;
337 }
338
339 if (result.IsEmpty())
340 return args.GetReturnValue().Set(status);
341
342 return args.GetReturnValue().Set(result.ToLocalChecked());
343 }
344
ICUErrorName(const FunctionCallbackInfo<Value> & args)345 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
346 Environment* env = Environment::GetCurrent(args);
347 CHECK(args[0]->IsInt32());
348 UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
349 args.GetReturnValue().Set(
350 String::NewFromUtf8(env->isolate(),
351 u_errorName(status)).ToLocalChecked());
352 }
353
354 } // anonymous namespace
355
Converter(const char * name,const char * sub)356 Converter::Converter(const char* name, const char* sub) {
357 UErrorCode status = U_ZERO_ERROR;
358 UConverter* conv = ucnv_open(name, &status);
359 CHECK(U_SUCCESS(status));
360 conv_.reset(conv);
361 set_subst_chars(sub);
362 }
363
Converter(UConverter * converter,const char * sub)364 Converter::Converter(UConverter* converter, const char* sub)
365 : conv_(converter) {
366 set_subst_chars(sub);
367 }
368
set_subst_chars(const char * sub)369 void Converter::set_subst_chars(const char* sub) {
370 CHECK(conv_);
371 UErrorCode status = U_ZERO_ERROR;
372 if (sub != nullptr) {
373 ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
374 CHECK(U_SUCCESS(status));
375 }
376 }
377
reset()378 void Converter::reset() {
379 ucnv_reset(conv_.get());
380 }
381
min_char_size() const382 size_t Converter::min_char_size() const {
383 CHECK(conv_);
384 return ucnv_getMinCharSize(conv_.get());
385 }
386
max_char_size() const387 size_t Converter::max_char_size() const {
388 CHECK(conv_);
389 return ucnv_getMaxCharSize(conv_.get());
390 }
391
Has(const FunctionCallbackInfo<Value> & args)392 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
393 Environment* env = Environment::GetCurrent(args);
394
395 CHECK_GE(args.Length(), 1);
396 Utf8Value label(env->isolate(), args[0]);
397
398 UErrorCode status = U_ZERO_ERROR;
399 ConverterPointer conv(ucnv_open(*label, &status));
400 args.GetReturnValue().Set(!!U_SUCCESS(status));
401 }
402
Create(const FunctionCallbackInfo<Value> & args)403 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
404 Environment* env = Environment::GetCurrent(args);
405
406 Local<ObjectTemplate> t = env->i18n_converter_template();
407 Local<Object> obj;
408 if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
409
410 CHECK_GE(args.Length(), 2);
411 Utf8Value label(env->isolate(), args[0]);
412 int flags = args[1]->Uint32Value(env->context()).ToChecked();
413 bool fatal =
414 (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
415
416 UErrorCode status = U_ZERO_ERROR;
417 UConverter* conv = ucnv_open(*label, &status);
418 if (U_FAILURE(status))
419 return;
420
421 if (fatal) {
422 status = U_ZERO_ERROR;
423 ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
424 nullptr, nullptr, nullptr, &status);
425 }
426
427 new ConverterObject(env, obj, conv, flags);
428 args.GetReturnValue().Set(obj);
429 }
430
Decode(const FunctionCallbackInfo<Value> & args)431 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
432 Environment* env = Environment::GetCurrent(args);
433
434 CHECK_GE(args.Length(), 3); // Converter, Buffer, Flags
435
436 ConverterObject* converter;
437 ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
438 ArrayBufferViewContents<char> input(args[1]);
439 int flags = args[2]->Uint32Value(env->context()).ToChecked();
440
441 UErrorCode status = U_ZERO_ERROR;
442 MaybeStackBuffer<UChar> result;
443 MaybeLocal<Object> ret;
444
445 UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
446
447 // When flushing the final chunk, the limit is the maximum
448 // of either the input buffer length or the number of pending
449 // characters times the min char size.
450 size_t limit = converter->min_char_size() *
451 (!flush ?
452 input.length() :
453 std::max(
454 input.length(),
455 static_cast<size_t>(
456 ucnv_toUCountPending(converter->conv(), &status))));
457 status = U_ZERO_ERROR;
458
459 if (limit > 0)
460 result.AllocateSufficientStorage(limit);
461
462 auto cleanup = OnScopeLeave([&]() {
463 if (flush) {
464 // Reset the converter state.
465 converter->set_bom_seen(false);
466 converter->reset();
467 }
468 });
469
470 const char* source = input.data();
471 size_t source_length = input.length();
472
473 UChar* target = *result;
474 ucnv_toUnicode(converter->conv(),
475 &target,
476 target + (limit * sizeof(UChar)),
477 &source,
478 source + source_length,
479 nullptr,
480 flush,
481 &status);
482
483 if (U_SUCCESS(status)) {
484 bool omit_initial_bom = false;
485 if (limit > 0) {
486 result.SetLength(target - &result[0]);
487 if (result.length() > 0 &&
488 converter->unicode() &&
489 !converter->ignore_bom() &&
490 !converter->bom_seen()) {
491 // If the very first result in the stream is a BOM, and we are not
492 // explicitly told to ignore it, then we mark it for discarding.
493 if (result[0] == 0xFEFF)
494 omit_initial_bom = true;
495 converter->set_bom_seen(true);
496 }
497 }
498 ret = ToBufferEndian(env, &result);
499 if (omit_initial_bom && !ret.IsEmpty()) {
500 // Perform `ret = ret.slice(2)`.
501 CHECK(ret.ToLocalChecked()->IsUint8Array());
502 Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
503 ret = Buffer::New(env,
504 orig_ret->Buffer(),
505 orig_ret->ByteOffset() + 2,
506 orig_ret->ByteLength() - 2)
507 .FromMaybe(Local<Uint8Array>());
508 }
509 if (!ret.IsEmpty())
510 args.GetReturnValue().Set(ret.ToLocalChecked());
511 return;
512 }
513
514 args.GetReturnValue().Set(status);
515 }
516
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)517 ConverterObject::ConverterObject(
518 Environment* env,
519 Local<Object> wrap,
520 UConverter* converter,
521 int flags,
522 const char* sub)
523 : BaseObject(env, wrap),
524 Converter(converter, sub),
525 flags_(flags) {
526 MakeWeak();
527
528 switch (ucnv_getType(converter)) {
529 case UCNV_UTF8:
530 case UCNV_UTF16_BigEndian:
531 case UCNV_UTF16_LittleEndian:
532 flags_ |= CONVERTER_FLAGS_UNICODE;
533 break;
534 default: {
535 // Fall through
536 }
537 }
538 }
539
540
InitializeICUDirectory(const std::string & path)541 bool InitializeICUDirectory(const std::string& path) {
542 UErrorCode status = U_ZERO_ERROR;
543 if (path.empty()) {
544 #ifdef NODE_HAVE_SMALL_ICU
545 // install the 'small' data.
546 udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
547 #else // !NODE_HAVE_SMALL_ICU
548 // no small data, so nothing to do.
549 #endif // !NODE_HAVE_SMALL_ICU
550 } else {
551 u_setDataDirectory(path.c_str());
552 u_init(&status);
553 }
554 return status == U_ZERO_ERROR;
555 }
556
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)557 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
558 const char* input,
559 size_t length) {
560 UErrorCode status = U_ZERO_ERROR;
561 uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
562 UIDNA* uidna = uidna_openUTS46(options, &status);
563 if (U_FAILURE(status))
564 return -1;
565 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
566
567 int32_t len = uidna_nameToUnicodeUTF8(uidna,
568 input, length,
569 **buf, buf->capacity(),
570 &info,
571 &status);
572
573 // Do not check info.errors like we do with ToASCII since ToUnicode always
574 // returns a string, despite any possible errors that may have occurred.
575
576 if (status == U_BUFFER_OVERFLOW_ERROR) {
577 status = U_ZERO_ERROR;
578 buf->AllocateSufficientStorage(len);
579 len = uidna_nameToUnicodeUTF8(uidna,
580 input, length,
581 **buf, buf->capacity(),
582 &info,
583 &status);
584 }
585
586 // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
587 // string, regardless of whether an error occurred.
588
589 if (U_FAILURE(status)) {
590 len = -1;
591 buf->SetLength(0);
592 } else {
593 buf->SetLength(len);
594 }
595
596 uidna_close(uidna);
597 return len;
598 }
599
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,enum idna_mode mode)600 int32_t ToASCII(MaybeStackBuffer<char>* buf,
601 const char* input,
602 size_t length,
603 enum idna_mode mode) {
604 UErrorCode status = U_ZERO_ERROR;
605 uint32_t options = // CheckHyphens = false; handled later
606 UIDNA_CHECK_BIDI | // CheckBidi = true
607 UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true
608 UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing
609 if (mode == IDNA_STRICT) {
610 options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict
611 // VerifyDnsLength = beStrict;
612 // handled later
613 }
614
615 UIDNA* uidna = uidna_openUTS46(options, &status);
616 if (U_FAILURE(status))
617 return -1;
618 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
619
620 int32_t len = uidna_nameToASCII_UTF8(uidna,
621 input, length,
622 **buf, buf->capacity(),
623 &info,
624 &status);
625
626 if (status == U_BUFFER_OVERFLOW_ERROR) {
627 status = U_ZERO_ERROR;
628 buf->AllocateSufficientStorage(len);
629 len = uidna_nameToASCII_UTF8(uidna,
630 input, length,
631 **buf, buf->capacity(),
632 &info,
633 &status);
634 }
635
636 // In UTS #46 which specifies ToASCII, certain error conditions are
637 // configurable through options, and the WHATWG URL Standard promptly elects
638 // to disable some of them to accommodate for real-world use cases.
639 // Unfortunately, ICU4C's IDNA module does not support disabling some of
640 // these options through `options` above, and thus continues throwing
641 // unnecessary errors. To counter this situation, we just filter out the
642 // errors that may have happened afterwards, before deciding whether to
643 // return an error from this function.
644
645 // CheckHyphens = false
646 // (Specified in the current UTS #46 draft rev. 18.)
647 // Refs:
648 // - https://github.com/whatwg/url/issues/53
649 // - https://github.com/whatwg/url/pull/309
650 // - http://www.unicode.org/review/pri317/
651 // - http://www.unicode.org/reports/tr46/tr46-18.html
652 // - https://www.icann.org/news/announcement-2000-01-07-en
653 info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
654 info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
655 info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
656
657 if (mode != IDNA_STRICT) {
658 // VerifyDnsLength = beStrict
659 info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
660 info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
661 info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
662 }
663
664 if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) {
665 len = -1;
666 buf->SetLength(0);
667 } else {
668 buf->SetLength(len);
669 }
670
671 uidna_close(uidna);
672 return len;
673 }
674
ToUnicode(const FunctionCallbackInfo<Value> & args)675 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
676 Environment* env = Environment::GetCurrent(args);
677 CHECK_GE(args.Length(), 1);
678 CHECK(args[0]->IsString());
679 Utf8Value val(env->isolate(), args[0]);
680
681 MaybeStackBuffer<char> buf;
682 int32_t len = ToUnicode(&buf, *val, val.length());
683
684 if (len < 0) {
685 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
686 }
687
688 args.GetReturnValue().Set(
689 String::NewFromUtf8(env->isolate(),
690 *buf,
691 NewStringType::kNormal,
692 len).ToLocalChecked());
693 }
694
ToASCII(const FunctionCallbackInfo<Value> & args)695 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
696 Environment* env = Environment::GetCurrent(args);
697 CHECK_GE(args.Length(), 1);
698 CHECK(args[0]->IsString());
699 Utf8Value val(env->isolate(), args[0]);
700 // optional arg
701 bool lenient = args[1]->BooleanValue(env->isolate());
702 enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT;
703
704 MaybeStackBuffer<char> buf;
705 int32_t len = ToASCII(&buf, *val, val.length(), mode);
706
707 if (len < 0) {
708 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
709 }
710
711 args.GetReturnValue().Set(
712 String::NewFromUtf8(env->isolate(),
713 *buf,
714 NewStringType::kNormal,
715 len).ToLocalChecked());
716 }
717
718 // This is similar to wcwidth except that it takes the current unicode
719 // character properties database into consideration, allowing it to
720 // correctly calculate the column widths of things like emoji's and
721 // newer wide characters. wcwidth, on the other hand, uses a fixed
722 // algorithm that does not take things like emoji into proper
723 // consideration.
724 //
725 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
726 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
727 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
728 // allow it to be input. Linux's PTY terminal prints control characters as
729 // Narrow rhombi.
730 //
731 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
732 // consonants are 0-width when combined with initial consonants; otherwise they
733 // are technically Wide. But many terminals (including Konsole and
734 // VTE/GLib-based) implement all medials and finals as 0-width.
735 //
736 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
737 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
738 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)739 static int GetColumnWidth(UChar32 codepoint,
740 bool ambiguous_as_full_width = false) {
741 // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
742 // codepoint as being full width, wide, ambiguous, neutral, narrow,
743 // or halfwidth.
744 const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
745 switch (eaw) {
746 case U_EA_FULLWIDTH:
747 case U_EA_WIDE:
748 return 2;
749 case U_EA_AMBIGUOUS:
750 // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
751 if (ambiguous_as_full_width) {
752 return 2;
753 }
754 // If ambiguous_as_full_width is false:
755 // Fall through
756 case U_EA_NEUTRAL:
757 if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
758 return 2;
759 }
760 // Fall through
761 case U_EA_HALFWIDTH:
762 case U_EA_NARROW:
763 default:
764 const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code
765 U_GC_CF_MASK | // Format control character
766 U_GC_ME_MASK | // Enclosing mark
767 U_GC_MN_MASK; // Nonspacing mark
768 if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width
769 ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
770 u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
771 return 0;
772 }
773 return 1;
774 }
775 }
776
777 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)778 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
779 Environment* env = Environment::GetCurrent(args);
780 CHECK(args[0]->IsString());
781
782 bool ambiguous_as_full_width = args[1]->IsTrue();
783 bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
784
785 TwoByteValue value(env->isolate(), args[0]);
786 // reinterpret_cast is required by windows to compile
787 UChar* str = reinterpret_cast<UChar*>(*value);
788 static_assert(sizeof(*str) == sizeof(**value),
789 "sizeof(*str) == sizeof(**value)");
790 UChar32 c = 0;
791 UChar32 p;
792 size_t n = 0;
793 uint32_t width = 0;
794
795 while (n < value.length()) {
796 p = c;
797 U16_NEXT(str, n, value.length(), c);
798 // Don't count individual emoji codepoints that occur within an
799 // emoji sequence. This is not necessarily foolproof. Some
800 // environments display emoji sequences in the appropriate
801 // condensed form (as a single emoji glyph), other environments
802 // may not understand an emoji sequence and will display each
803 // individual emoji separately. When this happens, the width
804 // calculated will be off, and there's no reliable way of knowing
805 // in advance if a particular sequence is going to be supported.
806 // The expand_emoji_sequence option allows the caller to skip this
807 // check and count each code within an emoji sequence separately.
808 // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
809 if (!expand_emoji_sequence &&
810 n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner)
811 (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
812 u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
813 continue;
814 }
815 width += GetColumnWidth(c, ambiguous_as_full_width);
816 }
817 args.GetReturnValue().Set(width);
818 }
819
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)820 void Initialize(Local<Object> target,
821 Local<Value> unused,
822 Local<Context> context,
823 void* priv) {
824 Environment* env = Environment::GetCurrent(context);
825 env->SetMethod(target, "toUnicode", ToUnicode);
826 env->SetMethod(target, "toASCII", ToASCII);
827 env->SetMethod(target, "getStringWidth", GetStringWidth);
828
829 // One-shot converters
830 env->SetMethod(target, "icuErrName", ICUErrorName);
831 env->SetMethod(target, "transcode", Transcode);
832
833 // ConverterObject
834 {
835 Local<FunctionTemplate> t = FunctionTemplate::New(env->isolate());
836 t->Inherit(BaseObject::GetConstructorTemplate(env));
837 t->InstanceTemplate()->SetInternalFieldCount(
838 ConverterObject::kInternalFieldCount);
839 Local<String> converter_string =
840 FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
841 t->SetClassName(converter_string);
842 env->set_i18n_converter_template(t->InstanceTemplate());
843 }
844
845 env->SetMethod(target, "getConverter", ConverterObject::Create);
846 env->SetMethod(target, "decode", ConverterObject::Decode);
847 env->SetMethod(target, "hasConverter", ConverterObject::Has);
848 }
849
850 } // namespace i18n
851 } // namespace node
852
853 NODE_MODULE_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
854
855 #endif // NODE_HAVE_I18N_SUPPORT
856