• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21 
/*
 * notes: by srl295
 *  - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null)
 *    data ( stubdata/libicudata.a ) containing no data at all, and it is also
 *    linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
 *    macro names. That's the "english+root" data.
 *
 *    If icu_data_path is non-null, the user has provided a path and we assume
 *    it goes somewhere useful. We set that path in ICU, and exit.
 *    If icu_data_path is null, they haven't set a path and we want the
 *    "english+root" data.  We call
 *       udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
 *    to load up the english+root data.
 *
 *  - When NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its
 *    full data. All of the variables and command line options for changing
 *    data at runtime are disabled, as they wouldn't fully override the
 *    internal data.
 *    See:  http://bugs.icu-project.org/trac/ticket/10924
 */
41 
42 
43 #include "node_i18n.h"
44 
45 #if defined(NODE_HAVE_I18N_SUPPORT)
46 
47 #include "base_object-inl.h"
48 #include "node.h"
49 #include "node_buffer.h"
50 #include "node_errors.h"
51 #include "node_internals.h"
52 #include "util-inl.h"
53 #include "v8.h"
54 
55 #include <unicode/utypes.h>
56 #include <unicode/putil.h>
57 #include <unicode/uchar.h>
58 #include <unicode/uclean.h>
59 #include <unicode/udata.h>
60 #include <unicode/uidna.h>
61 #include <unicode/ucnv.h>
62 #include <unicode/utf8.h>
63 #include <unicode/utf16.h>
64 #include <unicode/timezone.h>
65 #include <unicode/ulocdata.h>
66 #include <unicode/uvernum.h>
67 #include <unicode/uversion.h>
68 #include <unicode/ustring.h>
69 
#ifdef NODE_HAVE_SMALL_ICU
// When this is defined we have a 'secondary' data entry point. Its symbol
// name is "icusmdt" + optional library suffix + ICU major version + "_dat";
// compare with the U_ICUDATA_ENTRY_POINT definitions in utypes.h.
#ifdef U_LIB_SUFFIX_C_NAME
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
#else
#define SMALL_DEF(major, suff) icusmdt##major##_dat
#endif
// Extra level of indirection so the arguments are macro-expanded before
// SMALL_DEF pastes them together.
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
#define SMALL_ICUDATA_ENTRY_POINT \
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)

extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
#endif  // NODE_HAVE_SMALL_ICU
84 
85 namespace node {
86 
87 using v8::Context;
88 using v8::FunctionCallbackInfo;
89 using v8::FunctionTemplate;
90 using v8::Int32;
91 using v8::Isolate;
92 using v8::Local;
93 using v8::MaybeLocal;
94 using v8::NewStringType;
95 using v8::Object;
96 using v8::ObjectTemplate;
97 using v8::String;
98 using v8::Uint8Array;
99 using v8::Value;
100 
101 namespace i18n {
102 namespace {
103 
104 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)105 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
106   MaybeLocal<Object> ret = Buffer::New(env, buf);
107   if (ret.IsEmpty())
108     return ret;
109 
110   static_assert(sizeof(T) == 1 || sizeof(T) == 2,
111                 "Currently only one- or two-byte buffers are supported");
112   if (sizeof(T) > 1 && IsBigEndian()) {
113     SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
114     SwapBytes16(retbuf_data, retbuf_length);
115   }
116 
117   return ret;
118 }
119 
120 // One-Shot Converters
121 
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)122 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
123                       const char* data,
124                       const size_t length,
125                       const size_t length_in_chars) {
126   dest->AllocateSufficientStorage(length_in_chars);
127   char* dst = reinterpret_cast<char*>(**dest);
128   memcpy(dst, data, length);
129   if (IsBigEndian()) {
130     SwapBytes16(dst, length);
131   }
132 }
133 
134 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
135                                             const char* fromEncoding,
136                                             const char* toEncoding,
137                                             const char* source,
138                                             const size_t source_length,
139                                             UErrorCode* status);
140 
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)141 MaybeLocal<Object> Transcode(Environment* env,
142                              const char* fromEncoding,
143                              const char* toEncoding,
144                              const char* source,
145                              const size_t source_length,
146                              UErrorCode* status) {
147   *status = U_ZERO_ERROR;
148   MaybeLocal<Object> ret;
149   MaybeStackBuffer<char> result;
150   Converter to(toEncoding);
151   Converter from(fromEncoding);
152 
153   size_t sublen = ucnv_getMinCharSize(to.conv());
154   std::string sub(sublen, '?');
155   to.set_subst_chars(sub.c_str());
156 
157   const uint32_t limit = source_length * to.max_char_size();
158   result.AllocateSufficientStorage(limit);
159   char* target = *result;
160   ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
161                  &source, source + source_length, nullptr, nullptr,
162                  nullptr, nullptr, true, true, status);
163   if (U_SUCCESS(*status)) {
164     result.SetLength(target - &result[0]);
165     ret = ToBufferEndian(env, &result);
166   }
167   return ret;
168 }
169 
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)170 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
171                                    const char* fromEncoding,
172                                    const char* toEncoding,
173                                    const char* source,
174                                    const size_t source_length,
175                                    UErrorCode* status) {
176   *status = U_ZERO_ERROR;
177   MaybeLocal<Object> ret;
178   MaybeStackBuffer<UChar> destbuf(source_length);
179   Converter from(fromEncoding);
180   const size_t length_in_chars = source_length * sizeof(UChar);
181   ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
182                 source, source_length, status);
183   if (U_SUCCESS(*status))
184     ret = ToBufferEndian(env, &destbuf);
185   return ret;
186 }
187 
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)188 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
189                                      const char* fromEncoding,
190                                      const char* toEncoding,
191                                      const char* source,
192                                      const size_t source_length,
193                                      UErrorCode* status) {
194   *status = U_ZERO_ERROR;
195   MaybeStackBuffer<UChar> sourcebuf;
196   MaybeLocal<Object> ret;
197   Converter to(toEncoding);
198 
199   size_t sublen = ucnv_getMinCharSize(to.conv());
200   std::string sub(sublen, '?');
201   to.set_subst_chars(sub.c_str());
202 
203   const size_t length_in_chars = source_length / sizeof(UChar);
204   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
205   MaybeStackBuffer<char> destbuf(length_in_chars);
206   const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
207                                        *sourcebuf, length_in_chars, status);
208   if (U_SUCCESS(*status)) {
209     destbuf.SetLength(len);
210     ret = ToBufferEndian(env, &destbuf);
211   }
212   return ret;
213 }
214 
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)215 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
216                                          const char* fromEncoding,
217                                          const char* toEncoding,
218                                          const char* source,
219                                          const size_t source_length,
220                                          UErrorCode* status) {
221   *status = U_ZERO_ERROR;
222   MaybeStackBuffer<UChar> destbuf;
223   int32_t result_length;
224   u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
225                 source, source_length, status);
226   MaybeLocal<Object> ret;
227   if (U_SUCCESS(*status)) {
228     destbuf.SetLength(result_length);
229     ret = ToBufferEndian(env, &destbuf);
230   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
231     *status = U_ZERO_ERROR;
232     destbuf.AllocateSufficientStorage(result_length);
233     u_strFromUTF8(*destbuf, result_length, &result_length,
234                   source, source_length, status);
235     if (U_SUCCESS(*status)) {
236       destbuf.SetLength(result_length);
237       ret = ToBufferEndian(env, &destbuf);
238     }
239   }
240   return ret;
241 }
242 
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)243 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
244                                          const char* fromEncoding,
245                                          const char* toEncoding,
246                                          const char* source,
247                                          const size_t source_length,
248                                          UErrorCode* status) {
249   *status = U_ZERO_ERROR;
250   MaybeLocal<Object> ret;
251   const size_t length_in_chars = source_length / sizeof(UChar);
252   int32_t result_length;
253   MaybeStackBuffer<UChar> sourcebuf;
254   MaybeStackBuffer<char> destbuf;
255   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
256   u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
257               *sourcebuf, length_in_chars, status);
258   if (U_SUCCESS(*status)) {
259     destbuf.SetLength(result_length);
260     ret = ToBufferEndian(env, &destbuf);
261   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
262     *status = U_ZERO_ERROR;
263     destbuf.AllocateSufficientStorage(result_length);
264     u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
265                 length_in_chars, status);
266     if (U_SUCCESS(*status)) {
267       destbuf.SetLength(result_length);
268       ret = ToBufferEndian(env, &destbuf);
269     }
270   }
271   return ret;
272 }
273 
EncodingName(const enum encoding encoding)274 const char* EncodingName(const enum encoding encoding) {
275   switch (encoding) {
276     case ASCII: return "us-ascii";
277     case LATIN1: return "iso8859-1";
278     case UCS2: return "utf16le";
279     case UTF8: return "utf-8";
280     default: return nullptr;
281   }
282 }
283 
SupportedEncoding(const enum encoding encoding)284 bool SupportedEncoding(const enum encoding encoding) {
285   switch (encoding) {
286     case ASCII:
287     case LATIN1:
288     case UCS2:
289     case UTF8: return true;
290     default: return false;
291   }
292 }
293 
Transcode(const FunctionCallbackInfo<Value> & args)294 void Transcode(const FunctionCallbackInfo<Value>&args) {
295   Environment* env = Environment::GetCurrent(args);
296   Isolate* isolate = env->isolate();
297   UErrorCode status = U_ZERO_ERROR;
298   MaybeLocal<Object> result;
299 
300   ArrayBufferViewContents<char> input(args[0]);
301   const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
302   const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
303 
304   if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
305     TranscodeFunc tfn = &Transcode;
306     switch (fromEncoding) {
307       case ASCII:
308       case LATIN1:
309         if (toEncoding == UCS2)
310           tfn = &TranscodeToUcs2;
311         break;
312       case UTF8:
313         if (toEncoding == UCS2)
314           tfn = &TranscodeUcs2FromUtf8;
315         break;
316       case UCS2:
317         switch (toEncoding) {
318           case UCS2:
319             tfn = &Transcode;
320             break;
321           case UTF8:
322             tfn = &TranscodeUtf8FromUcs2;
323             break;
324           default:
325             tfn = &TranscodeFromUcs2;
326         }
327         break;
328       default:
329         // This should not happen because of the SupportedEncoding checks
330         ABORT();
331     }
332 
333     result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
334                  input.data(), input.length(), &status);
335   } else {
336     status = U_ILLEGAL_ARGUMENT_ERROR;
337   }
338 
339   if (result.IsEmpty())
340     return args.GetReturnValue().Set(status);
341 
342   return args.GetReturnValue().Set(result.ToLocalChecked());
343 }
344 
ICUErrorName(const FunctionCallbackInfo<Value> & args)345 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
346   Environment* env = Environment::GetCurrent(args);
347   CHECK(args[0]->IsInt32());
348   UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
349   args.GetReturnValue().Set(
350       String::NewFromUtf8(env->isolate(),
351                           u_errorName(status)).ToLocalChecked());
352 }
353 
354 }  // anonymous namespace
355 
Converter(const char * name,const char * sub)356 Converter::Converter(const char* name, const char* sub) {
357   UErrorCode status = U_ZERO_ERROR;
358   UConverter* conv = ucnv_open(name, &status);
359   CHECK(U_SUCCESS(status));
360   conv_.reset(conv);
361   set_subst_chars(sub);
362 }
363 
Converter(UConverter * converter,const char * sub)364 Converter::Converter(UConverter* converter, const char* sub)
365     : conv_(converter) {
366   set_subst_chars(sub);
367 }
368 
set_subst_chars(const char * sub)369 void Converter::set_subst_chars(const char* sub) {
370   CHECK(conv_);
371   UErrorCode status = U_ZERO_ERROR;
372   if (sub != nullptr) {
373     ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
374     CHECK(U_SUCCESS(status));
375   }
376 }
377 
reset()378 void Converter::reset() {
379   ucnv_reset(conv_.get());
380 }
381 
min_char_size() const382 size_t Converter::min_char_size() const {
383   CHECK(conv_);
384   return ucnv_getMinCharSize(conv_.get());
385 }
386 
max_char_size() const387 size_t Converter::max_char_size() const {
388   CHECK(conv_);
389   return ucnv_getMaxCharSize(conv_.get());
390 }
391 
Has(const FunctionCallbackInfo<Value> & args)392 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
393   Environment* env = Environment::GetCurrent(args);
394 
395   CHECK_GE(args.Length(), 1);
396   Utf8Value label(env->isolate(), args[0]);
397 
398   UErrorCode status = U_ZERO_ERROR;
399   ConverterPointer conv(ucnv_open(*label, &status));
400   args.GetReturnValue().Set(!!U_SUCCESS(status));
401 }
402 
Create(const FunctionCallbackInfo<Value> & args)403 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
404   Environment* env = Environment::GetCurrent(args);
405 
406   Local<ObjectTemplate> t = env->i18n_converter_template();
407   Local<Object> obj;
408   if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
409 
410   CHECK_GE(args.Length(), 2);
411   Utf8Value label(env->isolate(), args[0]);
412   int flags = args[1]->Uint32Value(env->context()).ToChecked();
413   bool fatal =
414       (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
415 
416   UErrorCode status = U_ZERO_ERROR;
417   UConverter* conv = ucnv_open(*label, &status);
418   if (U_FAILURE(status))
419     return;
420 
421   if (fatal) {
422     status = U_ZERO_ERROR;
423     ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
424                         nullptr, nullptr, nullptr, &status);
425   }
426 
427   new ConverterObject(env, obj, conv, flags);
428   args.GetReturnValue().Set(obj);
429 }
430 
Decode(const FunctionCallbackInfo<Value> & args)431 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
432   Environment* env = Environment::GetCurrent(args);
433 
434   CHECK_GE(args.Length(), 3);  // Converter, Buffer, Flags
435 
436   ConverterObject* converter;
437   ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
438   ArrayBufferViewContents<char> input(args[1]);
439   int flags = args[2]->Uint32Value(env->context()).ToChecked();
440 
441   UErrorCode status = U_ZERO_ERROR;
442   MaybeStackBuffer<UChar> result;
443   MaybeLocal<Object> ret;
444 
445   UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
446 
447   // When flushing the final chunk, the limit is the maximum
448   // of either the input buffer length or the number of pending
449   // characters times the min char size.
450   size_t limit = converter->min_char_size() *
451       (!flush ?
452           input.length() :
453           std::max(
454               input.length(),
455               static_cast<size_t>(
456                   ucnv_toUCountPending(converter->conv(), &status))));
457   status = U_ZERO_ERROR;
458 
459   if (limit > 0)
460     result.AllocateSufficientStorage(limit);
461 
462   auto cleanup = OnScopeLeave([&]() {
463     if (flush) {
464       // Reset the converter state.
465       converter->set_bom_seen(false);
466       converter->reset();
467     }
468   });
469 
470   const char* source = input.data();
471   size_t source_length = input.length();
472 
473   UChar* target = *result;
474   ucnv_toUnicode(converter->conv(),
475                  &target,
476                  target + (limit * sizeof(UChar)),
477                  &source,
478                  source + source_length,
479                  nullptr,
480                  flush,
481                  &status);
482 
483   if (U_SUCCESS(status)) {
484     bool omit_initial_bom = false;
485     if (limit > 0) {
486       result.SetLength(target - &result[0]);
487       if (result.length() > 0 &&
488           converter->unicode() &&
489           !converter->ignore_bom() &&
490           !converter->bom_seen()) {
491         // If the very first result in the stream is a BOM, and we are not
492         // explicitly told to ignore it, then we mark it for discarding.
493         if (result[0] == 0xFEFF)
494           omit_initial_bom = true;
495         converter->set_bom_seen(true);
496       }
497     }
498     ret = ToBufferEndian(env, &result);
499     if (omit_initial_bom && !ret.IsEmpty()) {
500       // Perform `ret = ret.slice(2)`.
501       CHECK(ret.ToLocalChecked()->IsUint8Array());
502       Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
503       ret = Buffer::New(env,
504                         orig_ret->Buffer(),
505                         orig_ret->ByteOffset() + 2,
506                         orig_ret->ByteLength() - 2)
507                             .FromMaybe(Local<Uint8Array>());
508     }
509     if (!ret.IsEmpty())
510       args.GetReturnValue().Set(ret.ToLocalChecked());
511     return;
512   }
513 
514   args.GetReturnValue().Set(status);
515 }
516 
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)517 ConverterObject::ConverterObject(
518     Environment* env,
519     Local<Object> wrap,
520     UConverter* converter,
521     int flags,
522     const char* sub)
523     : BaseObject(env, wrap),
524       Converter(converter, sub),
525       flags_(flags) {
526   MakeWeak();
527 
528   switch (ucnv_getType(converter)) {
529     case UCNV_UTF8:
530     case UCNV_UTF16_BigEndian:
531     case UCNV_UTF16_LittleEndian:
532       flags_ |= CONVERTER_FLAGS_UNICODE;
533       break;
534     default: {
535       // Fall through
536     }
537   }
538 }
539 
540 
InitializeICUDirectory(const std::string & path)541 bool InitializeICUDirectory(const std::string& path) {
542   UErrorCode status = U_ZERO_ERROR;
543   if (path.empty()) {
544 #ifdef NODE_HAVE_SMALL_ICU
545     // install the 'small' data.
546     udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
547 #else  // !NODE_HAVE_SMALL_ICU
548     // no small data, so nothing to do.
549 #endif  // !NODE_HAVE_SMALL_ICU
550   } else {
551     u_setDataDirectory(path.c_str());
552     u_init(&status);
553   }
554   return status == U_ZERO_ERROR;
555 }
556 
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)557 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
558                   const char* input,
559                   size_t length) {
560   UErrorCode status = U_ZERO_ERROR;
561   uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
562   UIDNA* uidna = uidna_openUTS46(options, &status);
563   if (U_FAILURE(status))
564     return -1;
565   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
566 
567   int32_t len = uidna_nameToUnicodeUTF8(uidna,
568                                         input, length,
569                                         **buf, buf->capacity(),
570                                         &info,
571                                         &status);
572 
573   // Do not check info.errors like we do with ToASCII since ToUnicode always
574   // returns a string, despite any possible errors that may have occurred.
575 
576   if (status == U_BUFFER_OVERFLOW_ERROR) {
577     status = U_ZERO_ERROR;
578     buf->AllocateSufficientStorage(len);
579     len = uidna_nameToUnicodeUTF8(uidna,
580                                   input, length,
581                                   **buf, buf->capacity(),
582                                   &info,
583                                   &status);
584   }
585 
586   // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
587   // string, regardless of whether an error occurred.
588 
589   if (U_FAILURE(status)) {
590     len = -1;
591     buf->SetLength(0);
592   } else {
593     buf->SetLength(len);
594   }
595 
596   uidna_close(uidna);
597   return len;
598 }
599 
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,enum idna_mode mode)600 int32_t ToASCII(MaybeStackBuffer<char>* buf,
601                 const char* input,
602                 size_t length,
603                 enum idna_mode mode) {
604   UErrorCode status = U_ZERO_ERROR;
605   uint32_t options =                  // CheckHyphens = false; handled later
606     UIDNA_CHECK_BIDI |                // CheckBidi = true
607     UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
608     UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
609   if (mode == IDNA_STRICT) {
610     options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
611                                       // VerifyDnsLength = beStrict;
612                                       //   handled later
613   }
614 
615   UIDNA* uidna = uidna_openUTS46(options, &status);
616   if (U_FAILURE(status))
617     return -1;
618   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
619 
620   int32_t len = uidna_nameToASCII_UTF8(uidna,
621                                        input, length,
622                                        **buf, buf->capacity(),
623                                        &info,
624                                        &status);
625 
626   if (status == U_BUFFER_OVERFLOW_ERROR) {
627     status = U_ZERO_ERROR;
628     buf->AllocateSufficientStorage(len);
629     len = uidna_nameToASCII_UTF8(uidna,
630                                  input, length,
631                                  **buf, buf->capacity(),
632                                  &info,
633                                  &status);
634   }
635 
636   // In UTS #46 which specifies ToASCII, certain error conditions are
637   // configurable through options, and the WHATWG URL Standard promptly elects
638   // to disable some of them to accommodate for real-world use cases.
639   // Unfortunately, ICU4C's IDNA module does not support disabling some of
640   // these options through `options` above, and thus continues throwing
641   // unnecessary errors. To counter this situation, we just filter out the
642   // errors that may have happened afterwards, before deciding whether to
643   // return an error from this function.
644 
645   // CheckHyphens = false
646   // (Specified in the current UTS #46 draft rev. 18.)
647   // Refs:
648   // - https://github.com/whatwg/url/issues/53
649   // - https://github.com/whatwg/url/pull/309
650   // - http://www.unicode.org/review/pri317/
651   // - http://www.unicode.org/reports/tr46/tr46-18.html
652   // - https://www.icann.org/news/announcement-2000-01-07-en
653   info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
654   info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
655   info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
656 
657   if (mode != IDNA_STRICT) {
658     // VerifyDnsLength = beStrict
659     info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
660     info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
661     info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
662   }
663 
664   if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) {
665     len = -1;
666     buf->SetLength(0);
667   } else {
668     buf->SetLength(len);
669   }
670 
671   uidna_close(uidna);
672   return len;
673 }
674 
ToUnicode(const FunctionCallbackInfo<Value> & args)675 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
676   Environment* env = Environment::GetCurrent(args);
677   CHECK_GE(args.Length(), 1);
678   CHECK(args[0]->IsString());
679   Utf8Value val(env->isolate(), args[0]);
680 
681   MaybeStackBuffer<char> buf;
682   int32_t len = ToUnicode(&buf, *val, val.length());
683 
684   if (len < 0) {
685     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
686   }
687 
688   args.GetReturnValue().Set(
689       String::NewFromUtf8(env->isolate(),
690                           *buf,
691                           NewStringType::kNormal,
692                           len).ToLocalChecked());
693 }
694 
ToASCII(const FunctionCallbackInfo<Value> & args)695 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
696   Environment* env = Environment::GetCurrent(args);
697   CHECK_GE(args.Length(), 1);
698   CHECK(args[0]->IsString());
699   Utf8Value val(env->isolate(), args[0]);
700   // optional arg
701   bool lenient = args[1]->BooleanValue(env->isolate());
702   enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT;
703 
704   MaybeStackBuffer<char> buf;
705   int32_t len = ToASCII(&buf, *val, val.length(), mode);
706 
707   if (len < 0) {
708     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
709   }
710 
711   args.GetReturnValue().Set(
712       String::NewFromUtf8(env->isolate(),
713                           *buf,
714                           NewStringType::kNormal,
715                           len).ToLocalChecked());
716 }
717 
// This is similar to wcwidth except that it takes the current Unicode
// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emoji and
// newer wide characters. wcwidth, on the other hand, uses a fixed
// algorithm that does not take things like emoji into proper
// consideration.
//
// TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
// GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
// below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
// allow it to be input. Linux's PTY terminal prints control characters as
// Narrow rhombi.
//
// TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
// consonants are 0-width when combined with initial consonants; otherwise they
// are technically Wide. But many terminals (including Konsole and
// VTE/GLib-based) implement all medials and finals as 0-width.
//
// Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
// Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
// Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)739 static int GetColumnWidth(UChar32 codepoint,
740                           bool ambiguous_as_full_width = false) {
741   // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
742   // codepoint as being full width, wide, ambiguous, neutral, narrow,
743   // or halfwidth.
744   const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
745   switch (eaw) {
746     case U_EA_FULLWIDTH:
747     case U_EA_WIDE:
748       return 2;
749     case U_EA_AMBIGUOUS:
750       // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
751       if (ambiguous_as_full_width) {
752         return 2;
753       }
754       // If ambiguous_as_full_width is false:
755       // Fall through
756     case U_EA_NEUTRAL:
757       if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
758         return 2;
759       }
760       // Fall through
761     case U_EA_HALFWIDTH:
762     case U_EA_NARROW:
763     default:
764       const auto zero_width_mask = U_GC_CC_MASK |  // C0/C1 control code
765                                   U_GC_CF_MASK |  // Format control character
766                                   U_GC_ME_MASK |  // Enclosing mark
767                                   U_GC_MN_MASK;   // Nonspacing mark
768       if (codepoint != 0x00AD &&  // SOFT HYPHEN is Cf but not zero-width
769           ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
770           u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
771         return 0;
772       }
773       return 1;
774   }
775 }
776 
777 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)778 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
779   Environment* env = Environment::GetCurrent(args);
780   CHECK(args[0]->IsString());
781 
782   bool ambiguous_as_full_width = args[1]->IsTrue();
783   bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
784 
785   TwoByteValue value(env->isolate(), args[0]);
786   // reinterpret_cast is required by windows to compile
787   UChar* str = reinterpret_cast<UChar*>(*value);
788   static_assert(sizeof(*str) == sizeof(**value),
789                 "sizeof(*str) == sizeof(**value)");
790   UChar32 c = 0;
791   UChar32 p;
792   size_t n = 0;
793   uint32_t width = 0;
794 
795   while (n < value.length()) {
796     p = c;
797     U16_NEXT(str, n, value.length(), c);
798     // Don't count individual emoji codepoints that occur within an
799     // emoji sequence. This is not necessarily foolproof. Some
800     // environments display emoji sequences in the appropriate
801     // condensed form (as a single emoji glyph), other environments
802     // may not understand an emoji sequence and will display each
803     // individual emoji separately. When this happens, the width
804     // calculated will be off, and there's no reliable way of knowing
805     // in advance if a particular sequence is going to be supported.
806     // The expand_emoji_sequence option allows the caller to skip this
807     // check and count each code within an emoji sequence separately.
808     // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
809     if (!expand_emoji_sequence &&
810         n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
811         (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
812          u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
813       continue;
814     }
815     width += GetColumnWidth(c, ambiguous_as_full_width);
816   }
817   args.GetReturnValue().Set(width);
818 }
819 
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)820 void Initialize(Local<Object> target,
821                 Local<Value> unused,
822                 Local<Context> context,
823                 void* priv) {
824   Environment* env = Environment::GetCurrent(context);
825   env->SetMethod(target, "toUnicode", ToUnicode);
826   env->SetMethod(target, "toASCII", ToASCII);
827   env->SetMethod(target, "getStringWidth", GetStringWidth);
828 
829   // One-shot converters
830   env->SetMethod(target, "icuErrName", ICUErrorName);
831   env->SetMethod(target, "transcode", Transcode);
832 
833   // ConverterObject
834   {
835     Local<FunctionTemplate> t = FunctionTemplate::New(env->isolate());
836     t->Inherit(BaseObject::GetConstructorTemplate(env));
837     t->InstanceTemplate()->SetInternalFieldCount(
838         ConverterObject::kInternalFieldCount);
839     Local<String> converter_string =
840         FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
841     t->SetClassName(converter_string);
842     env->set_i18n_converter_template(t->InstanceTemplate());
843   }
844 
845   env->SetMethod(target, "getConverter", ConverterObject::Create);
846   env->SetMethod(target, "decode", ConverterObject::Decode);
847   env->SetMethod(target, "hasConverter", ConverterObject::Has);
848 }
849 
850 }  // namespace i18n
851 }  // namespace node
852 
853 NODE_MODULE_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
854 
855 #endif  // NODE_HAVE_I18N_SUPPORT
856