1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22 /*
23 * notes: by srl295
24 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25 * ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26 * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27 * macro names. That's the "english+root" data.
28 *
29 * If icu_data_path is non-null, the user has provided a path and we assume
30 * it goes somewhere useful. We set that path in ICU, and exit.
31 * If icu_data_path is null, they haven't set a path and we want the
32 * "english+root" data. We call
33 * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34 * to load up the english+root data.
35 *
36 * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37 * data. All of the variables and command line options for changing data at
38 * runtime are disabled, as they wouldn't fully override the internal data.
39 * See: http://bugs.icu-project.org/trac/ticket/10924
40 */
41
42
43 #include "node_i18n.h"
44 #include "node_external_reference.h"
45
46 #if defined(NODE_HAVE_I18N_SUPPORT)
47
48 #include "base_object-inl.h"
49 #include "node.h"
50 #include "node_buffer.h"
51 #include "node_errors.h"
52 #include "node_internals.h"
53 #include "string_bytes.h"
54 #include "util-inl.h"
55 #include "v8.h"
56
57 #include <unicode/utypes.h>
58 #include <unicode/putil.h>
59 #include <unicode/uchar.h>
60 #include <unicode/uclean.h>
61 #include <unicode/udata.h>
62 #include <unicode/uidna.h>
63 #include <unicode/ucnv.h>
64 #include <unicode/utf8.h>
65 #include <unicode/utf16.h>
66 #include <unicode/timezone.h>
67 #include <unicode/ulocdata.h>
68 #include <unicode/uvernum.h>
69 #include <unicode/uversion.h>
70 #include <unicode/ustring.h>
71
#ifdef NODE_HAVE_SMALL_ICU
/* When NODE_HAVE_SMALL_ICU is defined there is a 'secondary' data entry
   point. The macros below build its symbol name: icusmdt<major>_dat, or
   icusmdt<suffix><major>_dat when U_LIB_SUFFIX_C_NAME is defined.
   Compare with the utypes.h definitions for U_ICUDATA_ENTRY_POINT. */
#ifdef U_LIB_SUFFIX_C_NAME
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
#else
#define SMALL_DEF(major, suff) icusmdt##major##_dat
#endif
/* Extra level of indirection so that the arguments are macro-expanded
   before SMALL_DEF pastes them together. */
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
#define SMALL_ICUDATA_ENTRY_POINT \
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)

extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
#endif
86
87 namespace node {
88
89 using v8::Context;
90 using v8::FunctionCallbackInfo;
91 using v8::FunctionTemplate;
92 using v8::Int32;
93 using v8::Isolate;
94 using v8::Local;
95 using v8::MaybeLocal;
96 using v8::NewStringType;
97 using v8::Object;
98 using v8::ObjectTemplate;
99 using v8::String;
100 using v8::Value;
101
102 namespace i18n {
103 namespace {
104
105 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)106 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
107 MaybeLocal<Object> ret = Buffer::New(env, buf);
108 if (ret.IsEmpty())
109 return ret;
110
111 static_assert(sizeof(T) == 1 || sizeof(T) == 2,
112 "Currently only one- or two-byte buffers are supported");
113 if (sizeof(T) > 1 && IsBigEndian()) {
114 SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
115 SwapBytes16(retbuf_data, retbuf_length);
116 }
117
118 return ret;
119 }
120
121 // One-Shot Converters
122
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)123 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
124 const char* data,
125 const size_t length,
126 const size_t length_in_chars) {
127 dest->AllocateSufficientStorage(length_in_chars);
128 char* dst = reinterpret_cast<char*>(**dest);
129 memcpy(dst, data, length);
130 if (IsBigEndian()) {
131 SwapBytes16(dst, length);
132 }
133 }
134
135 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
136 const char* fromEncoding,
137 const char* toEncoding,
138 const char* source,
139 const size_t source_length,
140 UErrorCode* status);
141
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)142 MaybeLocal<Object> Transcode(Environment* env,
143 const char* fromEncoding,
144 const char* toEncoding,
145 const char* source,
146 const size_t source_length,
147 UErrorCode* status) {
148 *status = U_ZERO_ERROR;
149 MaybeLocal<Object> ret;
150 MaybeStackBuffer<char> result;
151 Converter to(toEncoding);
152 Converter from(fromEncoding);
153
154 size_t sublen = ucnv_getMinCharSize(to.conv());
155 std::string sub(sublen, '?');
156 to.set_subst_chars(sub.c_str());
157
158 const uint32_t limit = source_length * to.max_char_size();
159 result.AllocateSufficientStorage(limit);
160 char* target = *result;
161 ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
162 &source, source + source_length, nullptr, nullptr,
163 nullptr, nullptr, true, true, status);
164 if (U_SUCCESS(*status)) {
165 result.SetLength(target - &result[0]);
166 ret = ToBufferEndian(env, &result);
167 }
168 return ret;
169 }
170
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)171 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
172 const char* fromEncoding,
173 const char* toEncoding,
174 const char* source,
175 const size_t source_length,
176 UErrorCode* status) {
177 *status = U_ZERO_ERROR;
178 MaybeLocal<Object> ret;
179 MaybeStackBuffer<UChar> destbuf(source_length);
180 Converter from(fromEncoding);
181 const size_t length_in_chars = source_length * sizeof(UChar);
182 ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
183 source, source_length, status);
184 if (U_SUCCESS(*status))
185 ret = ToBufferEndian(env, &destbuf);
186 return ret;
187 }
188
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)189 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
190 const char* fromEncoding,
191 const char* toEncoding,
192 const char* source,
193 const size_t source_length,
194 UErrorCode* status) {
195 *status = U_ZERO_ERROR;
196 MaybeStackBuffer<UChar> sourcebuf;
197 MaybeLocal<Object> ret;
198 Converter to(toEncoding);
199
200 size_t sublen = ucnv_getMinCharSize(to.conv());
201 std::string sub(sublen, '?');
202 to.set_subst_chars(sub.c_str());
203
204 const size_t length_in_chars = source_length / sizeof(UChar);
205 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
206 MaybeStackBuffer<char> destbuf(length_in_chars);
207 const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
208 *sourcebuf, length_in_chars, status);
209 if (U_SUCCESS(*status)) {
210 destbuf.SetLength(len);
211 ret = ToBufferEndian(env, &destbuf);
212 }
213 return ret;
214 }
215
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)216 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
217 const char* fromEncoding,
218 const char* toEncoding,
219 const char* source,
220 const size_t source_length,
221 UErrorCode* status) {
222 *status = U_ZERO_ERROR;
223 MaybeStackBuffer<UChar> destbuf;
224 int32_t result_length;
225 u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
226 source, source_length, status);
227 MaybeLocal<Object> ret;
228 if (U_SUCCESS(*status)) {
229 destbuf.SetLength(result_length);
230 ret = ToBufferEndian(env, &destbuf);
231 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
232 *status = U_ZERO_ERROR;
233 destbuf.AllocateSufficientStorage(result_length);
234 u_strFromUTF8(*destbuf, result_length, &result_length,
235 source, source_length, status);
236 if (U_SUCCESS(*status)) {
237 destbuf.SetLength(result_length);
238 ret = ToBufferEndian(env, &destbuf);
239 }
240 }
241 return ret;
242 }
243
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)244 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
245 const char* fromEncoding,
246 const char* toEncoding,
247 const char* source,
248 const size_t source_length,
249 UErrorCode* status) {
250 *status = U_ZERO_ERROR;
251 MaybeLocal<Object> ret;
252 const size_t length_in_chars = source_length / sizeof(UChar);
253 int32_t result_length;
254 MaybeStackBuffer<UChar> sourcebuf;
255 MaybeStackBuffer<char> destbuf;
256 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
257 u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
258 *sourcebuf, length_in_chars, status);
259 if (U_SUCCESS(*status)) {
260 destbuf.SetLength(result_length);
261 ret = ToBufferEndian(env, &destbuf);
262 } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
263 *status = U_ZERO_ERROR;
264 destbuf.AllocateSufficientStorage(result_length);
265 u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
266 length_in_chars, status);
267 if (U_SUCCESS(*status)) {
268 destbuf.SetLength(result_length);
269 ret = ToBufferEndian(env, &destbuf);
270 }
271 }
272 return ret;
273 }
274
EncodingName(const enum encoding encoding)275 const char* EncodingName(const enum encoding encoding) {
276 switch (encoding) {
277 case ASCII: return "us-ascii";
278 case LATIN1: return "iso8859-1";
279 case UCS2: return "utf16le";
280 case UTF8: return "utf-8";
281 default: return nullptr;
282 }
283 }
284
SupportedEncoding(const enum encoding encoding)285 bool SupportedEncoding(const enum encoding encoding) {
286 switch (encoding) {
287 case ASCII:
288 case LATIN1:
289 case UCS2:
290 case UTF8: return true;
291 default: return false;
292 }
293 }
294
Transcode(const FunctionCallbackInfo<Value> & args)295 void Transcode(const FunctionCallbackInfo<Value>&args) {
296 Environment* env = Environment::GetCurrent(args);
297 Isolate* isolate = env->isolate();
298 UErrorCode status = U_ZERO_ERROR;
299 MaybeLocal<Object> result;
300
301 ArrayBufferViewContents<char> input(args[0]);
302 const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
303 const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
304
305 if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
306 TranscodeFunc tfn = &Transcode;
307 switch (fromEncoding) {
308 case ASCII:
309 case LATIN1:
310 if (toEncoding == UCS2)
311 tfn = &TranscodeToUcs2;
312 break;
313 case UTF8:
314 if (toEncoding == UCS2)
315 tfn = &TranscodeUcs2FromUtf8;
316 break;
317 case UCS2:
318 switch (toEncoding) {
319 case UCS2:
320 tfn = &Transcode;
321 break;
322 case UTF8:
323 tfn = &TranscodeUtf8FromUcs2;
324 break;
325 default:
326 tfn = &TranscodeFromUcs2;
327 }
328 break;
329 default:
330 // This should not happen because of the SupportedEncoding checks
331 ABORT();
332 }
333
334 result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
335 input.data(), input.length(), &status);
336 } else {
337 status = U_ILLEGAL_ARGUMENT_ERROR;
338 }
339
340 if (result.IsEmpty())
341 return args.GetReturnValue().Set(status);
342
343 return args.GetReturnValue().Set(result.ToLocalChecked());
344 }
345
ICUErrorName(const FunctionCallbackInfo<Value> & args)346 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
347 Environment* env = Environment::GetCurrent(args);
348 CHECK(args[0]->IsInt32());
349 UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
350 args.GetReturnValue().Set(
351 String::NewFromUtf8(env->isolate(),
352 u_errorName(status)).ToLocalChecked());
353 }
354
355 } // anonymous namespace
356
Converter(const char * name,const char * sub)357 Converter::Converter(const char* name, const char* sub) {
358 UErrorCode status = U_ZERO_ERROR;
359 UConverter* conv = ucnv_open(name, &status);
360 CHECK(U_SUCCESS(status));
361 conv_.reset(conv);
362 set_subst_chars(sub);
363 }
364
Converter(UConverter * converter,const char * sub)365 Converter::Converter(UConverter* converter, const char* sub)
366 : conv_(converter) {
367 set_subst_chars(sub);
368 }
369
set_subst_chars(const char * sub)370 void Converter::set_subst_chars(const char* sub) {
371 CHECK(conv_);
372 UErrorCode status = U_ZERO_ERROR;
373 if (sub != nullptr) {
374 ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
375 CHECK(U_SUCCESS(status));
376 }
377 }
378
reset()379 void Converter::reset() {
380 ucnv_reset(conv_.get());
381 }
382
min_char_size() const383 size_t Converter::min_char_size() const {
384 CHECK(conv_);
385 return ucnv_getMinCharSize(conv_.get());
386 }
387
max_char_size() const388 size_t Converter::max_char_size() const {
389 CHECK(conv_);
390 return ucnv_getMaxCharSize(conv_.get());
391 }
392
Has(const FunctionCallbackInfo<Value> & args)393 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
394 Environment* env = Environment::GetCurrent(args);
395
396 CHECK_GE(args.Length(), 1);
397 Utf8Value label(env->isolate(), args[0]);
398
399 UErrorCode status = U_ZERO_ERROR;
400 ConverterPointer conv(ucnv_open(*label, &status));
401 args.GetReturnValue().Set(!!U_SUCCESS(status));
402 }
403
Create(const FunctionCallbackInfo<Value> & args)404 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
405 Environment* env = Environment::GetCurrent(args);
406
407 Local<ObjectTemplate> t = env->i18n_converter_template();
408 Local<Object> obj;
409 if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
410
411 CHECK_GE(args.Length(), 2);
412 Utf8Value label(env->isolate(), args[0]);
413 int flags = args[1]->Uint32Value(env->context()).ToChecked();
414 bool fatal =
415 (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
416
417 UErrorCode status = U_ZERO_ERROR;
418 UConverter* conv = ucnv_open(*label, &status);
419 if (U_FAILURE(status))
420 return;
421
422 if (fatal) {
423 status = U_ZERO_ERROR;
424 ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
425 nullptr, nullptr, nullptr, &status);
426 }
427
428 auto converter = new ConverterObject(env, obj, conv, flags);
429 size_t sublen = ucnv_getMinCharSize(conv);
430 std::string sub(sublen, '?');
431 converter->set_subst_chars(sub.c_str());
432
433 args.GetReturnValue().Set(obj);
434 }
435
Decode(const FunctionCallbackInfo<Value> & args)436 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
437 Environment* env = Environment::GetCurrent(args);
438
439 CHECK_GE(args.Length(), 4); // Converter, Buffer, Flags, Encoding
440
441 ConverterObject* converter;
442 ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
443
444 if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
445 args[1]->IsArrayBufferView())) {
446 return node::THROW_ERR_INVALID_ARG_TYPE(
447 env->isolate(),
448 "The \"input\" argument must be an instance of SharedArrayBuffer, "
449 "ArrayBuffer or ArrayBufferView.");
450 }
451
452 ArrayBufferViewContents<char> input(args[1]);
453 int flags = args[2]->Uint32Value(env->context()).ToChecked();
454
455 CHECK(args[3]->IsString());
456 Local<String> from_encoding = args[3].As<String>();
457
458 UErrorCode status = U_ZERO_ERROR;
459 MaybeStackBuffer<UChar> result;
460
461 UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
462
463 // When flushing the final chunk, the limit is the maximum
464 // of either the input buffer length or the number of pending
465 // characters times the min char size, multiplied by 2 as unicode may
466 // take up to 2 UChars to encode a character
467 size_t limit = 2 * converter->min_char_size() *
468 (!flush ?
469 input.length() :
470 std::max(
471 input.length(),
472 static_cast<size_t>(
473 ucnv_toUCountPending(converter->conv(), &status))));
474 status = U_ZERO_ERROR;
475
476 if (limit > 0)
477 result.AllocateSufficientStorage(limit);
478
479 auto cleanup = OnScopeLeave([&]() {
480 if (flush) {
481 // Reset the converter state.
482 converter->set_bom_seen(false);
483 converter->reset();
484 }
485 });
486
487 const char* source = input.data();
488 size_t source_length = input.length();
489
490 UChar* target = *result;
491 ucnv_toUnicode(converter->conv(),
492 &target,
493 target + limit,
494 &source,
495 source + source_length,
496 nullptr,
497 flush,
498 &status);
499
500 if (U_SUCCESS(status)) {
501 bool omit_initial_bom = false;
502 if (limit > 0) {
503 result.SetLength(target - &result[0]);
504 if (result.length() > 0 &&
505 converter->unicode() &&
506 !converter->ignore_bom() &&
507 !converter->bom_seen()) {
508 // If the very first result in the stream is a BOM, and we are not
509 // explicitly told to ignore it, then we mark it for discarding.
510 if (result[0] == 0xFEFF)
511 omit_initial_bom = true;
512 converter->set_bom_seen(true);
513 }
514 }
515
516 Local<Value> error;
517 UChar* output = result.out();
518 size_t beginning = 0;
519 size_t length = result.length() * sizeof(UChar);
520
521 if (omit_initial_bom) {
522 // Perform `ret = ret.slice(2)`.
523 beginning += 2;
524 length -= 2;
525 }
526
527 char* value = reinterpret_cast<char*>(output) + beginning;
528
529 if (IsBigEndian()) {
530 SwapBytes16(value, length);
531 }
532
533 MaybeLocal<Value> encoded =
534 StringBytes::Encode(env->isolate(), value, length, UCS2, &error);
535
536 Local<Value> ret;
537 if (encoded.ToLocal(&ret)) {
538 args.GetReturnValue().Set(ret);
539 return;
540 }
541 }
542
543 node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
544 env->isolate(),
545 "The encoded data was not valid for encoding %s",
546 *node::Utf8Value(env->isolate(), from_encoding));
547 }
548
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)549 ConverterObject::ConverterObject(
550 Environment* env,
551 Local<Object> wrap,
552 UConverter* converter,
553 int flags,
554 const char* sub)
555 : BaseObject(env, wrap),
556 Converter(converter, sub),
557 flags_(flags) {
558 MakeWeak();
559
560 switch (ucnv_getType(converter)) {
561 case UCNV_UTF8:
562 case UCNV_UTF16_BigEndian:
563 case UCNV_UTF16_LittleEndian:
564 flags_ |= CONVERTER_FLAGS_UNICODE;
565 break;
566 default: {
567 // Fall through
568 }
569 }
570 }
571
572
InitializeICUDirectory(const std::string & path)573 bool InitializeICUDirectory(const std::string& path) {
574 UErrorCode status = U_ZERO_ERROR;
575 if (path.empty()) {
576 #ifdef NODE_HAVE_SMALL_ICU
577 // install the 'small' data.
578 udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
579 #else // !NODE_HAVE_SMALL_ICU
580 // no small data, so nothing to do.
581 #endif // !NODE_HAVE_SMALL_ICU
582 } else {
583 u_setDataDirectory(path.c_str());
584 u_init(&status);
585 }
586 return status == U_ZERO_ERROR;
587 }
588
SetDefaultTimeZone(const char * tzid)589 void SetDefaultTimeZone(const char* tzid) {
590 size_t tzidlen = strlen(tzid) + 1;
591 UErrorCode status = U_ZERO_ERROR;
592 MaybeStackBuffer<UChar, 256> id(tzidlen);
593 u_charsToUChars(tzid, id.out(), tzidlen);
594 // This is threadsafe:
595 ucal_setDefaultTimeZone(id.out(), &status);
596 CHECK(U_SUCCESS(status));
597 }
598
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)599 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
600 const char* input,
601 size_t length) {
602 UErrorCode status = U_ZERO_ERROR;
603 uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
604 UIDNA* uidna = uidna_openUTS46(options, &status);
605 if (U_FAILURE(status))
606 return -1;
607 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
608
609 int32_t len = uidna_nameToUnicodeUTF8(uidna,
610 input, length,
611 **buf, buf->capacity(),
612 &info,
613 &status);
614
615 // Do not check info.errors like we do with ToASCII since ToUnicode always
616 // returns a string, despite any possible errors that may have occurred.
617
618 if (status == U_BUFFER_OVERFLOW_ERROR) {
619 status = U_ZERO_ERROR;
620 buf->AllocateSufficientStorage(len);
621 len = uidna_nameToUnicodeUTF8(uidna,
622 input, length,
623 **buf, buf->capacity(),
624 &info,
625 &status);
626 }
627
628 // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
629 // string, regardless of whether an error occurred.
630
631 if (U_FAILURE(status)) {
632 len = -1;
633 buf->SetLength(0);
634 } else {
635 buf->SetLength(len);
636 }
637
638 uidna_close(uidna);
639 return len;
640 }
641
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,idna_mode mode)642 int32_t ToASCII(MaybeStackBuffer<char>* buf,
643 const char* input,
644 size_t length,
645 idna_mode mode) {
646 UErrorCode status = U_ZERO_ERROR;
647 uint32_t options = // CheckHyphens = false; handled later
648 UIDNA_CHECK_BIDI | // CheckBidi = true
649 UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true
650 UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing
651 if (mode == idna_mode::kStrict) {
652 options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict
653 // VerifyDnsLength = beStrict;
654 // handled later
655 }
656
657 UIDNA* uidna = uidna_openUTS46(options, &status);
658 if (U_FAILURE(status))
659 return -1;
660 UIDNAInfo info = UIDNA_INFO_INITIALIZER;
661
662 int32_t len = uidna_nameToASCII_UTF8(uidna,
663 input, length,
664 **buf, buf->capacity(),
665 &info,
666 &status);
667
668 if (status == U_BUFFER_OVERFLOW_ERROR) {
669 status = U_ZERO_ERROR;
670 buf->AllocateSufficientStorage(len);
671 len = uidna_nameToASCII_UTF8(uidna,
672 input, length,
673 **buf, buf->capacity(),
674 &info,
675 &status);
676 }
677
678 // In UTS #46 which specifies ToASCII, certain error conditions are
679 // configurable through options, and the WHATWG URL Standard promptly elects
680 // to disable some of them to accommodate for real-world use cases.
681 // Unfortunately, ICU4C's IDNA module does not support disabling some of
682 // these options through `options` above, and thus continues throwing
683 // unnecessary errors. To counter this situation, we just filter out the
684 // errors that may have happened afterwards, before deciding whether to
685 // return an error from this function.
686
687 // CheckHyphens = false
688 // (Specified in the current UTS #46 draft rev. 18.)
689 // Refs:
690 // - https://github.com/whatwg/url/issues/53
691 // - https://github.com/whatwg/url/pull/309
692 // - http://www.unicode.org/review/pri317/
693 // - http://www.unicode.org/reports/tr46/tr46-18.html
694 // - https://www.icann.org/news/announcement-2000-01-07-en
695 info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
696 info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
697 info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
698
699 if (mode != idna_mode::kStrict) {
700 // VerifyDnsLength = beStrict
701 info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
702 info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
703 info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
704 }
705
706 if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
707 len = -1;
708 buf->SetLength(0);
709 } else {
710 buf->SetLength(len);
711 }
712
713 uidna_close(uidna);
714 return len;
715 }
716
ToUnicode(const FunctionCallbackInfo<Value> & args)717 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
718 Environment* env = Environment::GetCurrent(args);
719 CHECK_GE(args.Length(), 1);
720 CHECK(args[0]->IsString());
721 Utf8Value val(env->isolate(), args[0]);
722
723 MaybeStackBuffer<char> buf;
724 int32_t len = ToUnicode(&buf, *val, val.length());
725
726 if (len < 0) {
727 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
728 }
729
730 args.GetReturnValue().Set(
731 String::NewFromUtf8(env->isolate(),
732 *buf,
733 NewStringType::kNormal,
734 len).ToLocalChecked());
735 }
736
ToASCII(const FunctionCallbackInfo<Value> & args)737 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
738 Environment* env = Environment::GetCurrent(args);
739 CHECK_GE(args.Length(), 1);
740 CHECK(args[0]->IsString());
741 Utf8Value val(env->isolate(), args[0]);
742 // optional arg
743 bool lenient = args[1]->BooleanValue(env->isolate());
744 idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
745
746 MaybeStackBuffer<char> buf;
747 int32_t len = ToASCII(&buf, *val, val.length(), mode);
748
749 if (len < 0) {
750 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
751 }
752
753 args.GetReturnValue().Set(
754 String::NewFromUtf8(env->isolate(),
755 *buf,
756 NewStringType::kNormal,
757 len).ToLocalChecked());
758 }
759
760 // This is similar to wcwidth except that it takes the current unicode
761 // character properties database into consideration, allowing it to
762 // correctly calculate the column widths of things like emoji's and
763 // newer wide characters. wcwidth, on the other hand, uses a fixed
764 // algorithm that does not take things like emoji into proper
765 // consideration.
766 //
767 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
768 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
769 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
770 // allow it to be input. Linux's PTY terminal prints control characters as
771 // Narrow rhombi.
772 //
773 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
774 // consonants are 0-width when combined with initial consonants; otherwise they
775 // are technically Wide. But many terminals (including Konsole and
776 // VTE/GLib-based) implement all medials and finals as 0-width.
777 //
778 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
779 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
780 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)781 static int GetColumnWidth(UChar32 codepoint,
782 bool ambiguous_as_full_width = false) {
783 // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
784 // codepoint as being full width, wide, ambiguous, neutral, narrow,
785 // or halfwidth.
786 const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
787 switch (eaw) {
788 case U_EA_FULLWIDTH:
789 case U_EA_WIDE:
790 return 2;
791 case U_EA_AMBIGUOUS:
792 // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
793 if (ambiguous_as_full_width) {
794 return 2;
795 }
796 // If ambiguous_as_full_width is false:
797 [[fallthrough]];
798 case U_EA_NEUTRAL:
799 if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
800 return 2;
801 }
802 [[fallthrough]];
803 case U_EA_HALFWIDTH:
804 case U_EA_NARROW:
805 default:
806 const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code
807 U_GC_CF_MASK | // Format control character
808 U_GC_ME_MASK | // Enclosing mark
809 U_GC_MN_MASK; // Nonspacing mark
810 if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width
811 ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
812 u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
813 return 0;
814 }
815 return 1;
816 }
817 }
818
819 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)820 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
821 Environment* env = Environment::GetCurrent(args);
822 CHECK(args[0]->IsString());
823
824 bool ambiguous_as_full_width = args[1]->IsTrue();
825 bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
826
827 TwoByteValue value(env->isolate(), args[0]);
828 // reinterpret_cast is required by windows to compile
829 UChar* str = reinterpret_cast<UChar*>(*value);
830 static_assert(sizeof(*str) == sizeof(**value),
831 "sizeof(*str) == sizeof(**value)");
832 UChar32 c = 0;
833 UChar32 p;
834 size_t n = 0;
835 uint32_t width = 0;
836
837 while (n < value.length()) {
838 p = c;
839 U16_NEXT(str, n, value.length(), c);
840 // Don't count individual emoji codepoints that occur within an
841 // emoji sequence. This is not necessarily foolproof. Some
842 // environments display emoji sequences in the appropriate
843 // condensed form (as a single emoji glyph), other environments
844 // may not understand an emoji sequence and will display each
845 // individual emoji separately. When this happens, the width
846 // calculated will be off, and there's no reliable way of knowing
847 // in advance if a particular sequence is going to be supported.
848 // The expand_emoji_sequence option allows the caller to skip this
849 // check and count each code within an emoji sequence separately.
850 // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
851 if (!expand_emoji_sequence &&
852 n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner)
853 (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
854 u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
855 continue;
856 }
857 width += GetColumnWidth(c, ambiguous_as_full_width);
858 }
859 args.GetReturnValue().Set(width);
860 }
861
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)862 void Initialize(Local<Object> target,
863 Local<Value> unused,
864 Local<Context> context,
865 void* priv) {
866 Environment* env = Environment::GetCurrent(context);
867 SetMethod(context, target, "toUnicode", ToUnicode);
868 SetMethod(context, target, "toASCII", ToASCII);
869 SetMethod(context, target, "getStringWidth", GetStringWidth);
870
871 // One-shot converters
872 SetMethod(context, target, "icuErrName", ICUErrorName);
873 SetMethod(context, target, "transcode", Transcode);
874
875 // ConverterObject
876 {
877 Local<FunctionTemplate> t = NewFunctionTemplate(env->isolate(), nullptr);
878 t->Inherit(BaseObject::GetConstructorTemplate(env));
879 t->InstanceTemplate()->SetInternalFieldCount(
880 ConverterObject::kInternalFieldCount);
881 Local<String> converter_string =
882 FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
883 t->SetClassName(converter_string);
884 env->set_i18n_converter_template(t->InstanceTemplate());
885 }
886
887 SetMethod(context, target, "getConverter", ConverterObject::Create);
888 SetMethod(context, target, "decode", ConverterObject::Decode);
889 SetMethod(context, target, "hasConverter", ConverterObject::Has);
890 }
891
RegisterExternalReferences(ExternalReferenceRegistry * registry)892 void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
893 registry->Register(ToUnicode);
894 registry->Register(ToASCII);
895 registry->Register(GetStringWidth);
896 registry->Register(ICUErrorName);
897 registry->Register(Transcode);
898 registry->Register(ConverterObject::Create);
899 registry->Register(ConverterObject::Decode);
900 registry->Register(ConverterObject::Has);
901 }
902
903 } // namespace i18n
904 } // namespace node
905
// Register the internal `icu` binding: Initialize() sets it up for a
// context, and RegisterExternalReferences() lists its native callbacks.
NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences)
908
909 #endif // NODE_HAVE_I18N_SUPPORT
910