• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21 
22 /*
23  * notes: by srl295
24  *  - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25  *     ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26  *    linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27  *    macro names. That's the "english+root" data.
28  *
29  *    If icu_data_path is non-null, the user has provided a path and we assume
30  *    it goes somewhere useful. We set that path in ICU, and exit.
31  *    If icu_data_path is null, they haven't set a path and we want the
32  *    "english+root" data.  We call
33  *       udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34  *    to load up the english+root data.
35  *
36  *  - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37  *    data. All of the variables and command line options for changing data at
38  *    runtime are disabled, as they wouldn't fully override the internal data.
39  *    See:  http://bugs.icu-project.org/trac/ticket/10924
40  */
41 
42 
43 #include "node_i18n.h"
44 
45 #if defined(NODE_HAVE_I18N_SUPPORT)
46 
47 #include "base_object-inl.h"
48 #include "node.h"
49 #include "node_buffer.h"
50 #include "node_errors.h"
51 #include "node_internals.h"
52 #include "util-inl.h"
53 #include "v8.h"
54 
55 #include <unicode/utypes.h>
56 #include <unicode/putil.h>
57 #include <unicode/uchar.h>
58 #include <unicode/uclean.h>
59 #include <unicode/udata.h>
60 #include <unicode/uidna.h>
61 #include <unicode/ucnv.h>
62 #include <unicode/utf8.h>
63 #include <unicode/utf16.h>
64 #include <unicode/timezone.h>
65 #include <unicode/ulocdata.h>
66 #include <unicode/uvernum.h>
67 #include <unicode/uversion.h>
68 #include <unicode/ustring.h>
69 
70 #ifdef NODE_HAVE_SMALL_ICU
71 /* if this is defined, we have a 'secondary' entry point.
72    compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
73 #define SMALL_ICUDATA_ENTRY_POINT \
74   SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)
75 #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
76 #ifndef U_LIB_SUFFIX_C_NAME
77 #define SMALL_DEF(major, suff) icusmdt##major##_dat
78 #else
79 #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
80 #endif
81 
82 extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
83 #endif
84 
85 namespace node {
86 
87 using v8::Context;
88 using v8::FunctionCallbackInfo;
89 using v8::FunctionTemplate;
90 using v8::Int32;
91 using v8::Isolate;
92 using v8::Local;
93 using v8::MaybeLocal;
94 using v8::NewStringType;
95 using v8::Object;
96 using v8::ObjectTemplate;
97 using v8::String;
98 using v8::Uint8Array;
99 using v8::Value;
100 
101 namespace i18n {
102 namespace {
103 
104 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)105 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
106   MaybeLocal<Object> ret = Buffer::New(env, buf);
107   if (ret.IsEmpty())
108     return ret;
109 
110   static_assert(sizeof(T) == 1 || sizeof(T) == 2,
111                 "Currently only one- or two-byte buffers are supported");
112   if (sizeof(T) > 1 && IsBigEndian()) {
113     SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
114     SwapBytes16(retbuf_data, retbuf_length);
115   }
116 
117   return ret;
118 }
119 
120 // One-Shot Converters
121 
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)122 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
123                       const char* data,
124                       const size_t length,
125                       const size_t length_in_chars) {
126   dest->AllocateSufficientStorage(length_in_chars);
127   char* dst = reinterpret_cast<char*>(**dest);
128   memcpy(dst, data, length);
129   if (IsBigEndian()) {
130     SwapBytes16(dst, length);
131   }
132 }
133 
134 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
135                                             const char* fromEncoding,
136                                             const char* toEncoding,
137                                             const char* source,
138                                             const size_t source_length,
139                                             UErrorCode* status);
140 
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)141 MaybeLocal<Object> Transcode(Environment* env,
142                              const char* fromEncoding,
143                              const char* toEncoding,
144                              const char* source,
145                              const size_t source_length,
146                              UErrorCode* status) {
147   *status = U_ZERO_ERROR;
148   MaybeLocal<Object> ret;
149   MaybeStackBuffer<char> result;
150   Converter to(toEncoding, "?");
151   Converter from(fromEncoding);
152   const uint32_t limit = source_length * to.max_char_size();
153   result.AllocateSufficientStorage(limit);
154   char* target = *result;
155   ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
156                  &source, source + source_length, nullptr, nullptr,
157                  nullptr, nullptr, true, true, status);
158   if (U_SUCCESS(*status)) {
159     result.SetLength(target - &result[0]);
160     ret = ToBufferEndian(env, &result);
161   }
162   return ret;
163 }
164 
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)165 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
166                                    const char* fromEncoding,
167                                    const char* toEncoding,
168                                    const char* source,
169                                    const size_t source_length,
170                                    UErrorCode* status) {
171   *status = U_ZERO_ERROR;
172   MaybeLocal<Object> ret;
173   MaybeStackBuffer<UChar> destbuf(source_length);
174   Converter from(fromEncoding);
175   const size_t length_in_chars = source_length * sizeof(UChar);
176   ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
177                 source, source_length, status);
178   if (U_SUCCESS(*status))
179     ret = ToBufferEndian(env, &destbuf);
180   return ret;
181 }
182 
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)183 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
184                                      const char* fromEncoding,
185                                      const char* toEncoding,
186                                      const char* source,
187                                      const size_t source_length,
188                                      UErrorCode* status) {
189   *status = U_ZERO_ERROR;
190   MaybeStackBuffer<UChar> sourcebuf;
191   MaybeLocal<Object> ret;
192   Converter to(toEncoding, "?");
193   const size_t length_in_chars = source_length / sizeof(UChar);
194   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
195   MaybeStackBuffer<char> destbuf(length_in_chars);
196   const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
197                                        *sourcebuf, length_in_chars, status);
198   if (U_SUCCESS(*status)) {
199     destbuf.SetLength(len);
200     ret = ToBufferEndian(env, &destbuf);
201   }
202   return ret;
203 }
204 
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)205 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
206                                          const char* fromEncoding,
207                                          const char* toEncoding,
208                                          const char* source,
209                                          const size_t source_length,
210                                          UErrorCode* status) {
211   *status = U_ZERO_ERROR;
212   MaybeStackBuffer<UChar> destbuf;
213   int32_t result_length;
214   u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
215                 source, source_length, status);
216   MaybeLocal<Object> ret;
217   if (U_SUCCESS(*status)) {
218     destbuf.SetLength(result_length);
219     ret = ToBufferEndian(env, &destbuf);
220   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
221     *status = U_ZERO_ERROR;
222     destbuf.AllocateSufficientStorage(result_length);
223     u_strFromUTF8(*destbuf, result_length, &result_length,
224                   source, source_length, status);
225     if (U_SUCCESS(*status)) {
226       destbuf.SetLength(result_length);
227       ret = ToBufferEndian(env, &destbuf);
228     }
229   }
230   return ret;
231 }
232 
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)233 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
234                                          const char* fromEncoding,
235                                          const char* toEncoding,
236                                          const char* source,
237                                          const size_t source_length,
238                                          UErrorCode* status) {
239   *status = U_ZERO_ERROR;
240   MaybeLocal<Object> ret;
241   const size_t length_in_chars = source_length / sizeof(UChar);
242   int32_t result_length;
243   MaybeStackBuffer<UChar> sourcebuf;
244   MaybeStackBuffer<char> destbuf;
245   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
246   u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
247               *sourcebuf, length_in_chars, status);
248   if (U_SUCCESS(*status)) {
249     destbuf.SetLength(result_length);
250     ret = ToBufferEndian(env, &destbuf);
251   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
252     *status = U_ZERO_ERROR;
253     destbuf.AllocateSufficientStorage(result_length);
254     u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
255                 length_in_chars, status);
256     if (U_SUCCESS(*status)) {
257       destbuf.SetLength(result_length);
258       ret = ToBufferEndian(env, &destbuf);
259     }
260   }
261   return ret;
262 }
263 
EncodingName(const enum encoding encoding)264 const char* EncodingName(const enum encoding encoding) {
265   switch (encoding) {
266     case ASCII: return "us-ascii";
267     case LATIN1: return "iso8859-1";
268     case UCS2: return "utf16le";
269     case UTF8: return "utf-8";
270     default: return nullptr;
271   }
272 }
273 
SupportedEncoding(const enum encoding encoding)274 bool SupportedEncoding(const enum encoding encoding) {
275   switch (encoding) {
276     case ASCII:
277     case LATIN1:
278     case UCS2:
279     case UTF8: return true;
280     default: return false;
281   }
282 }
283 
Transcode(const FunctionCallbackInfo<Value> & args)284 void Transcode(const FunctionCallbackInfo<Value>&args) {
285   Environment* env = Environment::GetCurrent(args);
286   Isolate* isolate = env->isolate();
287   UErrorCode status = U_ZERO_ERROR;
288   MaybeLocal<Object> result;
289 
290   ArrayBufferViewContents<char> input(args[0]);
291   const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
292   const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
293 
294   if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
295     TranscodeFunc tfn = &Transcode;
296     switch (fromEncoding) {
297       case ASCII:
298       case LATIN1:
299         if (toEncoding == UCS2)
300           tfn = &TranscodeToUcs2;
301         break;
302       case UTF8:
303         if (toEncoding == UCS2)
304           tfn = &TranscodeUcs2FromUtf8;
305         break;
306       case UCS2:
307         switch (toEncoding) {
308           case UCS2:
309             tfn = &Transcode;
310             break;
311           case UTF8:
312             tfn = &TranscodeUtf8FromUcs2;
313             break;
314           default:
315             tfn = &TranscodeFromUcs2;
316         }
317         break;
318       default:
319         // This should not happen because of the SupportedEncoding checks
320         ABORT();
321     }
322 
323     result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
324                  input.data(), input.length(), &status);
325   } else {
326     status = U_ILLEGAL_ARGUMENT_ERROR;
327   }
328 
329   if (result.IsEmpty())
330     return args.GetReturnValue().Set(status);
331 
332   return args.GetReturnValue().Set(result.ToLocalChecked());
333 }
334 
ICUErrorName(const FunctionCallbackInfo<Value> & args)335 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
336   Environment* env = Environment::GetCurrent(args);
337   CHECK(args[0]->IsInt32());
338   UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
339   args.GetReturnValue().Set(
340       String::NewFromUtf8(env->isolate(),
341                           u_errorName(status),
342                           NewStringType::kNormal).ToLocalChecked());
343 }
344 
345 }  // anonymous namespace
346 
Converter(const char * name,const char * sub)347 Converter::Converter(const char* name, const char* sub) {
348   UErrorCode status = U_ZERO_ERROR;
349   UConverter* conv = ucnv_open(name, &status);
350   CHECK(U_SUCCESS(status));
351   conv_.reset(conv);
352   set_subst_chars(sub);
353 }
354 
Converter(UConverter * converter,const char * sub)355 Converter::Converter(UConverter* converter, const char* sub)
356     : conv_(converter) {
357   set_subst_chars(sub);
358 }
359 
set_subst_chars(const char * sub)360 void Converter::set_subst_chars(const char* sub) {
361   CHECK(conv_);
362   UErrorCode status = U_ZERO_ERROR;
363   if (sub != nullptr) {
364     ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
365     CHECK(U_SUCCESS(status));
366   }
367 }
368 
reset()369 void Converter::reset() {
370   ucnv_reset(conv_.get());
371 }
372 
min_char_size() const373 size_t Converter::min_char_size() const {
374   CHECK(conv_);
375   return ucnv_getMinCharSize(conv_.get());
376 }
377 
max_char_size() const378 size_t Converter::max_char_size() const {
379   CHECK(conv_);
380   return ucnv_getMaxCharSize(conv_.get());
381 }
382 
Has(const FunctionCallbackInfo<Value> & args)383 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
384   Environment* env = Environment::GetCurrent(args);
385 
386   CHECK_GE(args.Length(), 1);
387   Utf8Value label(env->isolate(), args[0]);
388 
389   UErrorCode status = U_ZERO_ERROR;
390   ConverterPointer conv(ucnv_open(*label, &status));
391   args.GetReturnValue().Set(!!U_SUCCESS(status));
392 }
393 
Create(const FunctionCallbackInfo<Value> & args)394 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
395   Environment* env = Environment::GetCurrent(args);
396 
397   Local<ObjectTemplate> t = env->i18n_converter_template();
398   Local<Object> obj;
399   if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
400 
401   CHECK_GE(args.Length(), 2);
402   Utf8Value label(env->isolate(), args[0]);
403   int flags = args[1]->Uint32Value(env->context()).ToChecked();
404   bool fatal =
405       (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
406 
407   UErrorCode status = U_ZERO_ERROR;
408   UConverter* conv = ucnv_open(*label, &status);
409   if (U_FAILURE(status))
410     return;
411 
412   if (fatal) {
413     status = U_ZERO_ERROR;
414     ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
415                         nullptr, nullptr, nullptr, &status);
416   }
417 
418   new ConverterObject(env, obj, conv, flags);
419   args.GetReturnValue().Set(obj);
420 }
421 
Decode(const FunctionCallbackInfo<Value> & args)422 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
423   Environment* env = Environment::GetCurrent(args);
424 
425   CHECK_GE(args.Length(), 3);  // Converter, Buffer, Flags
426 
427   ConverterObject* converter;
428   ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
429   ArrayBufferViewContents<char> input(args[1]);
430   int flags = args[2]->Uint32Value(env->context()).ToChecked();
431 
432   UErrorCode status = U_ZERO_ERROR;
433   MaybeStackBuffer<UChar> result;
434   MaybeLocal<Object> ret;
435   size_t limit = converter->min_char_size() * input.length();
436   if (limit > 0)
437     result.AllocateSufficientStorage(limit);
438 
439   UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
440   auto cleanup = OnScopeLeave([&]() {
441     if (flush) {
442       // Reset the converter state.
443       converter->set_bom_seen(false);
444       converter->reset();
445     }
446   });
447 
448   const char* source = input.data();
449   size_t source_length = input.length();
450 
451   UChar* target = *result;
452   ucnv_toUnicode(converter->conv(),
453                  &target,
454                  target + (limit * sizeof(UChar)),
455                  &source,
456                  source + source_length,
457                  nullptr,
458                  flush,
459                  &status);
460 
461   if (U_SUCCESS(status)) {
462     bool omit_initial_bom = false;
463     if (limit > 0) {
464       result.SetLength(target - &result[0]);
465       if (result.length() > 0 &&
466           converter->unicode() &&
467           !converter->ignore_bom() &&
468           !converter->bom_seen()) {
469         // If the very first result in the stream is a BOM, and we are not
470         // explicitly told to ignore it, then we mark it for discarding.
471         if (result[0] == 0xFEFF)
472           omit_initial_bom = true;
473         converter->set_bom_seen(true);
474       }
475     }
476     ret = ToBufferEndian(env, &result);
477     if (omit_initial_bom && !ret.IsEmpty()) {
478       // Peform `ret = ret.slice(2)`.
479       CHECK(ret.ToLocalChecked()->IsUint8Array());
480       Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
481       ret = Buffer::New(env,
482                         orig_ret->Buffer(),
483                         orig_ret->ByteOffset() + 2,
484                         orig_ret->ByteLength() - 2)
485                             .FromMaybe(Local<Uint8Array>());
486     }
487     if (!ret.IsEmpty())
488       args.GetReturnValue().Set(ret.ToLocalChecked());
489     return;
490   }
491 
492   args.GetReturnValue().Set(status);
493 }
494 
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)495 ConverterObject::ConverterObject(
496     Environment* env,
497     Local<Object> wrap,
498     UConverter* converter,
499     int flags,
500     const char* sub)
501     : BaseObject(env, wrap),
502       Converter(converter, sub),
503       flags_(flags) {
504   MakeWeak();
505 
506   switch (ucnv_getType(converter)) {
507     case UCNV_UTF8:
508     case UCNV_UTF16_BigEndian:
509     case UCNV_UTF16_LittleEndian:
510       flags_ |= CONVERTER_FLAGS_UNICODE;
511       break;
512     default: {
513       // Fall through
514     }
515   }
516 }
517 
518 
InitializeICUDirectory(const std::string & path)519 bool InitializeICUDirectory(const std::string& path) {
520   UErrorCode status = U_ZERO_ERROR;
521   if (path.empty()) {
522 #ifdef NODE_HAVE_SMALL_ICU
523     // install the 'small' data.
524     udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
525 #else  // !NODE_HAVE_SMALL_ICU
526     // no small data, so nothing to do.
527 #endif  // !NODE_HAVE_SMALL_ICU
528   } else {
529     u_setDataDirectory(path.c_str());
530     u_init(&status);
531   }
532   return status == U_ZERO_ERROR;
533 }
534 
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)535 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
536                   const char* input,
537                   size_t length) {
538   UErrorCode status = U_ZERO_ERROR;
539   uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
540   UIDNA* uidna = uidna_openUTS46(options, &status);
541   if (U_FAILURE(status))
542     return -1;
543   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
544 
545   int32_t len = uidna_nameToUnicodeUTF8(uidna,
546                                         input, length,
547                                         **buf, buf->capacity(),
548                                         &info,
549                                         &status);
550 
551   // Do not check info.errors like we do with ToASCII since ToUnicode always
552   // returns a string, despite any possible errors that may have occurred.
553 
554   if (status == U_BUFFER_OVERFLOW_ERROR) {
555     status = U_ZERO_ERROR;
556     buf->AllocateSufficientStorage(len);
557     len = uidna_nameToUnicodeUTF8(uidna,
558                                   input, length,
559                                   **buf, buf->capacity(),
560                                   &info,
561                                   &status);
562   }
563 
564   // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
565   // string, regardless of whether an error occurred.
566 
567   if (U_FAILURE(status)) {
568     len = -1;
569     buf->SetLength(0);
570   } else {
571     buf->SetLength(len);
572   }
573 
574   uidna_close(uidna);
575   return len;
576 }
577 
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,enum idna_mode mode)578 int32_t ToASCII(MaybeStackBuffer<char>* buf,
579                 const char* input,
580                 size_t length,
581                 enum idna_mode mode) {
582   UErrorCode status = U_ZERO_ERROR;
583   uint32_t options =                  // CheckHyphens = false; handled later
584     UIDNA_CHECK_BIDI |                // CheckBidi = true
585     UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
586     UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
587   if (mode == IDNA_STRICT) {
588     options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
589                                       // VerifyDnsLength = beStrict;
590                                       //   handled later
591   }
592 
593   UIDNA* uidna = uidna_openUTS46(options, &status);
594   if (U_FAILURE(status))
595     return -1;
596   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
597 
598   int32_t len = uidna_nameToASCII_UTF8(uidna,
599                                        input, length,
600                                        **buf, buf->capacity(),
601                                        &info,
602                                        &status);
603 
604   if (status == U_BUFFER_OVERFLOW_ERROR) {
605     status = U_ZERO_ERROR;
606     buf->AllocateSufficientStorage(len);
607     len = uidna_nameToASCII_UTF8(uidna,
608                                  input, length,
609                                  **buf, buf->capacity(),
610                                  &info,
611                                  &status);
612   }
613 
614   // In UTS #46 which specifies ToASCII, certain error conditions are
615   // configurable through options, and the WHATWG URL Standard promptly elects
616   // to disable some of them to accommodate for real-world use cases.
617   // Unfortunately, ICU4C's IDNA module does not support disabling some of
618   // these options through `options` above, and thus continues throwing
619   // unnecessary errors. To counter this situation, we just filter out the
620   // errors that may have happened afterwards, before deciding whether to
621   // return an error from this function.
622 
623   // CheckHyphens = false
624   // (Specified in the current UTS #46 draft rev. 18.)
625   // Refs:
626   // - https://github.com/whatwg/url/issues/53
627   // - https://github.com/whatwg/url/pull/309
628   // - http://www.unicode.org/review/pri317/
629   // - http://www.unicode.org/reports/tr46/tr46-18.html
630   // - https://www.icann.org/news/announcement-2000-01-07-en
631   info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
632   info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
633   info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
634 
635   if (mode != IDNA_STRICT) {
636     // VerifyDnsLength = beStrict
637     info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
638     info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
639     info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
640   }
641 
642   if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) {
643     len = -1;
644     buf->SetLength(0);
645   } else {
646     buf->SetLength(len);
647   }
648 
649   uidna_close(uidna);
650   return len;
651 }
652 
ToUnicode(const FunctionCallbackInfo<Value> & args)653 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
654   Environment* env = Environment::GetCurrent(args);
655   CHECK_GE(args.Length(), 1);
656   CHECK(args[0]->IsString());
657   Utf8Value val(env->isolate(), args[0]);
658 
659   MaybeStackBuffer<char> buf;
660   int32_t len = ToUnicode(&buf, *val, val.length());
661 
662   if (len < 0) {
663     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
664   }
665 
666   args.GetReturnValue().Set(
667       String::NewFromUtf8(env->isolate(),
668                           *buf,
669                           NewStringType::kNormal,
670                           len).ToLocalChecked());
671 }
672 
ToASCII(const FunctionCallbackInfo<Value> & args)673 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
674   Environment* env = Environment::GetCurrent(args);
675   CHECK_GE(args.Length(), 1);
676   CHECK(args[0]->IsString());
677   Utf8Value val(env->isolate(), args[0]);
678   // optional arg
679   bool lenient = args[1]->BooleanValue(env->isolate());
680   enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT;
681 
682   MaybeStackBuffer<char> buf;
683   int32_t len = ToASCII(&buf, *val, val.length(), mode);
684 
685   if (len < 0) {
686     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
687   }
688 
689   args.GetReturnValue().Set(
690       String::NewFromUtf8(env->isolate(),
691                           *buf,
692                           NewStringType::kNormal,
693                           len).ToLocalChecked());
694 }
695 
696 // This is similar to wcwidth except that it takes the current unicode
697 // character properties database into consideration, allowing it to
698 // correctly calculate the column widths of things like emoji's and
699 // newer wide characters. wcwidth, on the other hand, uses a fixed
700 // algorithm that does not take things like emoji into proper
701 // consideration.
702 //
703 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
704 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
705 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
706 // allow it to be input. Linux's PTY terminal prints control characters as
707 // Narrow rhombi.
708 //
709 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
710 // consonants are 0-width when combined with initial consonants; otherwise they
711 // are technically Wide. But many terminals (including Konsole and
712 // VTE/GLib-based) implement all medials and finals as 0-width.
713 //
714 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
715 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
716 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)717 static int GetColumnWidth(UChar32 codepoint,
718                           bool ambiguous_as_full_width = false) {
719   // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
720   // codepoint as being full width, wide, ambiguous, neutral, narrow,
721   // or halfwidth.
722   const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
723   switch (eaw) {
724     case U_EA_FULLWIDTH:
725     case U_EA_WIDE:
726       return 2;
727     case U_EA_AMBIGUOUS:
728       // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
729       if (ambiguous_as_full_width) {
730         return 2;
731       }
732       // If ambiguous_as_full_width is false:
733       // Fall through
734     case U_EA_NEUTRAL:
735       if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
736         return 2;
737       }
738       // Fall through
739     case U_EA_HALFWIDTH:
740     case U_EA_NARROW:
741     default:
742       const auto zero_width_mask = U_GC_CC_MASK |  // C0/C1 control code
743                                   U_GC_CF_MASK |  // Format control character
744                                   U_GC_ME_MASK |  // Enclosing mark
745                                   U_GC_MN_MASK;   // Nonspacing mark
746       if (codepoint != 0x00AD &&  // SOFT HYPHEN is Cf but not zero-width
747           ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
748           u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
749         return 0;
750       }
751       return 1;
752   }
753 }
754 
755 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)756 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
757   Environment* env = Environment::GetCurrent(args);
758   CHECK(args[0]->IsString());
759 
760   bool ambiguous_as_full_width = args[1]->IsTrue();
761   bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
762 
763   TwoByteValue value(env->isolate(), args[0]);
764   // reinterpret_cast is required by windows to compile
765   UChar* str = reinterpret_cast<UChar*>(*value);
766   static_assert(sizeof(*str) == sizeof(**value),
767                 "sizeof(*str) == sizeof(**value)");
768   UChar32 c = 0;
769   UChar32 p;
770   size_t n = 0;
771   uint32_t width = 0;
772 
773   while (n < value.length()) {
774     p = c;
775     U16_NEXT(str, n, value.length(), c);
776     // Don't count individual emoji codepoints that occur within an
777     // emoji sequence. This is not necessarily foolproof. Some
778     // environments display emoji sequences in the appropriate
779     // condensed form (as a single emoji glyph), other environments
780     // may not understand an emoji sequence and will display each
781     // individual emoji separately. When this happens, the width
782     // calculated will be off, and there's no reliable way of knowing
783     // in advance if a particular sequence is going to be supported.
784     // The expand_emoji_sequence option allows the caller to skip this
785     // check and count each code within an emoji sequence separately.
786     // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
787     if (!expand_emoji_sequence &&
788         n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
789         (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
790          u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
791       continue;
792     }
793     width += GetColumnWidth(c, ambiguous_as_full_width);
794   }
795   args.GetReturnValue().Set(width);
796 }
797 
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)798 void Initialize(Local<Object> target,
799                 Local<Value> unused,
800                 Local<Context> context,
801                 void* priv) {
802   Environment* env = Environment::GetCurrent(context);
803   env->SetMethod(target, "toUnicode", ToUnicode);
804   env->SetMethod(target, "toASCII", ToASCII);
805   env->SetMethod(target, "getStringWidth", GetStringWidth);
806 
807   // One-shot converters
808   env->SetMethod(target, "icuErrName", ICUErrorName);
809   env->SetMethod(target, "transcode", Transcode);
810 
811   // ConverterObject
812   {
813     Local<FunctionTemplate> t = FunctionTemplate::New(env->isolate());
814     t->Inherit(BaseObject::GetConstructorTemplate(env));
815     t->InstanceTemplate()->SetInternalFieldCount(
816         ConverterObject::kInternalFieldCount);
817     Local<String> converter_string =
818         FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
819     t->SetClassName(converter_string);
820     env->set_i18n_converter_template(t->InstanceTemplate());
821   }
822 
823   env->SetMethod(target, "getConverter", ConverterObject::Create);
824   env->SetMethod(target, "decode", ConverterObject::Decode);
825   env->SetMethod(target, "hasConverter", ConverterObject::Has);
826 }
827 
828 }  // namespace i18n
829 }  // namespace node
830 
831 NODE_MODULE_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
832 
833 #endif  // NODE_HAVE_I18N_SUPPORT
834