• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21 
22 /*
23  * notes: by srl295
24  *  - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25  *     ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26  *    linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27  *    macro names. That's the "english+root" data.
28  *
29  *    If icu_data_path is non-null, the user has provided a path and we assume
30  *    it goes somewhere useful. We set that path in ICU, and exit.
31  *    If icu_data_path is null, they haven't set a path and we want the
32  *    "english+root" data.  We call
33  *       udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34  *    to load up the english+root data.
35  *
36  *  - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37  *    data. All of the variables and command line options for changing data at
38  *    runtime are disabled, as they wouldn't fully override the internal data.
39  *    See:  http://bugs.icu-project.org/trac/ticket/10924
40  */
41 
42 
43 #include "node_i18n.h"
44 #include "node_external_reference.h"
45 
46 #if defined(NODE_HAVE_I18N_SUPPORT)
47 
48 #include "base_object-inl.h"
49 #include "node.h"
50 #include "node_buffer.h"
51 #include "node_errors.h"
52 #include "node_internals.h"
53 #include "string_bytes.h"
54 #include "util-inl.h"
55 #include "v8.h"
56 
57 #include <unicode/utypes.h>
58 #include <unicode/putil.h>
59 #include <unicode/uchar.h>
60 #include <unicode/uclean.h>
61 #include <unicode/udata.h>
62 #include <unicode/uidna.h>
63 #include <unicode/ucnv.h>
64 #include <unicode/utf8.h>
65 #include <unicode/utf16.h>
66 #include <unicode/timezone.h>
67 #include <unicode/ulocdata.h>
68 #include <unicode/uvernum.h>
69 #include <unicode/uversion.h>
70 #include <unicode/ustring.h>
71 
#ifdef NODE_HAVE_SMALL_ICU
// In small-ICU builds there is a 'secondary' data entry point. Its symbol
// name is assembled below from the ICU major version and, when defined, the
// library suffix; compare with the utypes.h definitions for
// U_ICUDATA_ENTRY_POINT.
#ifdef U_LIB_SUFFIX_C_NAME
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
#else
#define SMALL_DEF(major, suff) icusmdt##major##_dat
#endif
// The extra level of indirection lets the arguments be macro-expanded before
// they are pasted together by SMALL_DEF.
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
#define SMALL_ICUDATA_ENTRY_POINT \
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)

extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
#endif  // NODE_HAVE_SMALL_ICU
86 
87 namespace node {
88 
89 using v8::Context;
90 using v8::FunctionCallbackInfo;
91 using v8::FunctionTemplate;
92 using v8::Int32;
93 using v8::Isolate;
94 using v8::Local;
95 using v8::MaybeLocal;
96 using v8::NewStringType;
97 using v8::Object;
98 using v8::ObjectTemplate;
99 using v8::String;
100 using v8::Value;
101 
102 namespace i18n {
103 namespace {
104 
105 template <typename T>
ToBufferEndian(Environment * env,MaybeStackBuffer<T> * buf)106 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
107   MaybeLocal<Object> ret = Buffer::New(env, buf);
108   if (ret.IsEmpty())
109     return ret;
110 
111   static_assert(sizeof(T) == 1 || sizeof(T) == 2,
112                 "Currently only one- or two-byte buffers are supported");
113   if (sizeof(T) > 1 && IsBigEndian()) {
114     SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
115     SwapBytes16(retbuf_data, retbuf_length);
116   }
117 
118   return ret;
119 }
120 
121 // One-Shot Converters
122 
CopySourceBuffer(MaybeStackBuffer<UChar> * dest,const char * data,const size_t length,const size_t length_in_chars)123 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
124                       const char* data,
125                       const size_t length,
126                       const size_t length_in_chars) {
127   dest->AllocateSufficientStorage(length_in_chars);
128   char* dst = reinterpret_cast<char*>(**dest);
129   memcpy(dst, data, length);
130   if (IsBigEndian()) {
131     SwapBytes16(dst, length);
132   }
133 }
134 
135 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
136                                             const char* fromEncoding,
137                                             const char* toEncoding,
138                                             const char* source,
139                                             const size_t source_length,
140                                             UErrorCode* status);
141 
Transcode(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)142 MaybeLocal<Object> Transcode(Environment* env,
143                              const char* fromEncoding,
144                              const char* toEncoding,
145                              const char* source,
146                              const size_t source_length,
147                              UErrorCode* status) {
148   *status = U_ZERO_ERROR;
149   MaybeLocal<Object> ret;
150   MaybeStackBuffer<char> result;
151   Converter to(toEncoding);
152   Converter from(fromEncoding);
153 
154   size_t sublen = ucnv_getMinCharSize(to.conv());
155   std::string sub(sublen, '?');
156   to.set_subst_chars(sub.c_str());
157 
158   const uint32_t limit = source_length * to.max_char_size();
159   result.AllocateSufficientStorage(limit);
160   char* target = *result;
161   ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
162                  &source, source + source_length, nullptr, nullptr,
163                  nullptr, nullptr, true, true, status);
164   if (U_SUCCESS(*status)) {
165     result.SetLength(target - &result[0]);
166     ret = ToBufferEndian(env, &result);
167   }
168   return ret;
169 }
170 
TranscodeToUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)171 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
172                                    const char* fromEncoding,
173                                    const char* toEncoding,
174                                    const char* source,
175                                    const size_t source_length,
176                                    UErrorCode* status) {
177   *status = U_ZERO_ERROR;
178   MaybeLocal<Object> ret;
179   MaybeStackBuffer<UChar> destbuf(source_length);
180   Converter from(fromEncoding);
181   const size_t length_in_chars = source_length * sizeof(UChar);
182   ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
183                 source, source_length, status);
184   if (U_SUCCESS(*status))
185     ret = ToBufferEndian(env, &destbuf);
186   return ret;
187 }
188 
TranscodeFromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)189 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
190                                      const char* fromEncoding,
191                                      const char* toEncoding,
192                                      const char* source,
193                                      const size_t source_length,
194                                      UErrorCode* status) {
195   *status = U_ZERO_ERROR;
196   MaybeStackBuffer<UChar> sourcebuf;
197   MaybeLocal<Object> ret;
198   Converter to(toEncoding);
199 
200   size_t sublen = ucnv_getMinCharSize(to.conv());
201   std::string sub(sublen, '?');
202   to.set_subst_chars(sub.c_str());
203 
204   const size_t length_in_chars = source_length / sizeof(UChar);
205   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
206   MaybeStackBuffer<char> destbuf(length_in_chars);
207   const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
208                                        *sourcebuf, length_in_chars, status);
209   if (U_SUCCESS(*status)) {
210     destbuf.SetLength(len);
211     ret = ToBufferEndian(env, &destbuf);
212   }
213   return ret;
214 }
215 
TranscodeUcs2FromUtf8(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)216 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
217                                          const char* fromEncoding,
218                                          const char* toEncoding,
219                                          const char* source,
220                                          const size_t source_length,
221                                          UErrorCode* status) {
222   *status = U_ZERO_ERROR;
223   MaybeStackBuffer<UChar> destbuf;
224   int32_t result_length;
225   u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
226                 source, source_length, status);
227   MaybeLocal<Object> ret;
228   if (U_SUCCESS(*status)) {
229     destbuf.SetLength(result_length);
230     ret = ToBufferEndian(env, &destbuf);
231   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
232     *status = U_ZERO_ERROR;
233     destbuf.AllocateSufficientStorage(result_length);
234     u_strFromUTF8(*destbuf, result_length, &result_length,
235                   source, source_length, status);
236     if (U_SUCCESS(*status)) {
237       destbuf.SetLength(result_length);
238       ret = ToBufferEndian(env, &destbuf);
239     }
240   }
241   return ret;
242 }
243 
TranscodeUtf8FromUcs2(Environment * env,const char * fromEncoding,const char * toEncoding,const char * source,const size_t source_length,UErrorCode * status)244 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
245                                          const char* fromEncoding,
246                                          const char* toEncoding,
247                                          const char* source,
248                                          const size_t source_length,
249                                          UErrorCode* status) {
250   *status = U_ZERO_ERROR;
251   MaybeLocal<Object> ret;
252   const size_t length_in_chars = source_length / sizeof(UChar);
253   int32_t result_length;
254   MaybeStackBuffer<UChar> sourcebuf;
255   MaybeStackBuffer<char> destbuf;
256   CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
257   u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
258               *sourcebuf, length_in_chars, status);
259   if (U_SUCCESS(*status)) {
260     destbuf.SetLength(result_length);
261     ret = ToBufferEndian(env, &destbuf);
262   } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
263     *status = U_ZERO_ERROR;
264     destbuf.AllocateSufficientStorage(result_length);
265     u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
266                 length_in_chars, status);
267     if (U_SUCCESS(*status)) {
268       destbuf.SetLength(result_length);
269       ret = ToBufferEndian(env, &destbuf);
270     }
271   }
272   return ret;
273 }
274 
EncodingName(const enum encoding encoding)275 const char* EncodingName(const enum encoding encoding) {
276   switch (encoding) {
277     case ASCII: return "us-ascii";
278     case LATIN1: return "iso8859-1";
279     case UCS2: return "utf16le";
280     case UTF8: return "utf-8";
281     default: return nullptr;
282   }
283 }
284 
SupportedEncoding(const enum encoding encoding)285 bool SupportedEncoding(const enum encoding encoding) {
286   switch (encoding) {
287     case ASCII:
288     case LATIN1:
289     case UCS2:
290     case UTF8: return true;
291     default: return false;
292   }
293 }
294 
Transcode(const FunctionCallbackInfo<Value> & args)295 void Transcode(const FunctionCallbackInfo<Value>&args) {
296   Environment* env = Environment::GetCurrent(args);
297   Isolate* isolate = env->isolate();
298   UErrorCode status = U_ZERO_ERROR;
299   MaybeLocal<Object> result;
300 
301   ArrayBufferViewContents<char> input(args[0]);
302   const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
303   const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
304 
305   if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
306     TranscodeFunc tfn = &Transcode;
307     switch (fromEncoding) {
308       case ASCII:
309       case LATIN1:
310         if (toEncoding == UCS2)
311           tfn = &TranscodeToUcs2;
312         break;
313       case UTF8:
314         if (toEncoding == UCS2)
315           tfn = &TranscodeUcs2FromUtf8;
316         break;
317       case UCS2:
318         switch (toEncoding) {
319           case UCS2:
320             tfn = &Transcode;
321             break;
322           case UTF8:
323             tfn = &TranscodeUtf8FromUcs2;
324             break;
325           default:
326             tfn = &TranscodeFromUcs2;
327         }
328         break;
329       default:
330         // This should not happen because of the SupportedEncoding checks
331         ABORT();
332     }
333 
334     result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
335                  input.data(), input.length(), &status);
336   } else {
337     status = U_ILLEGAL_ARGUMENT_ERROR;
338   }
339 
340   if (result.IsEmpty())
341     return args.GetReturnValue().Set(status);
342 
343   return args.GetReturnValue().Set(result.ToLocalChecked());
344 }
345 
ICUErrorName(const FunctionCallbackInfo<Value> & args)346 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
347   Environment* env = Environment::GetCurrent(args);
348   CHECK(args[0]->IsInt32());
349   UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
350   args.GetReturnValue().Set(
351       String::NewFromUtf8(env->isolate(),
352                           u_errorName(status)).ToLocalChecked());
353 }
354 
355 }  // anonymous namespace
356 
Converter(const char * name,const char * sub)357 Converter::Converter(const char* name, const char* sub) {
358   UErrorCode status = U_ZERO_ERROR;
359   UConverter* conv = ucnv_open(name, &status);
360   CHECK(U_SUCCESS(status));
361   conv_.reset(conv);
362   set_subst_chars(sub);
363 }
364 
Converter(UConverter * converter,const char * sub)365 Converter::Converter(UConverter* converter, const char* sub)
366     : conv_(converter) {
367   set_subst_chars(sub);
368 }
369 
set_subst_chars(const char * sub)370 void Converter::set_subst_chars(const char* sub) {
371   CHECK(conv_);
372   UErrorCode status = U_ZERO_ERROR;
373   if (sub != nullptr) {
374     ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
375     CHECK(U_SUCCESS(status));
376   }
377 }
378 
reset()379 void Converter::reset() {
380   ucnv_reset(conv_.get());
381 }
382 
min_char_size() const383 size_t Converter::min_char_size() const {
384   CHECK(conv_);
385   return ucnv_getMinCharSize(conv_.get());
386 }
387 
max_char_size() const388 size_t Converter::max_char_size() const {
389   CHECK(conv_);
390   return ucnv_getMaxCharSize(conv_.get());
391 }
392 
Has(const FunctionCallbackInfo<Value> & args)393 void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
394   Environment* env = Environment::GetCurrent(args);
395 
396   CHECK_GE(args.Length(), 1);
397   Utf8Value label(env->isolate(), args[0]);
398 
399   UErrorCode status = U_ZERO_ERROR;
400   ConverterPointer conv(ucnv_open(*label, &status));
401   args.GetReturnValue().Set(!!U_SUCCESS(status));
402 }
403 
Create(const FunctionCallbackInfo<Value> & args)404 void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
405   Environment* env = Environment::GetCurrent(args);
406 
407   Local<ObjectTemplate> t = env->i18n_converter_template();
408   Local<Object> obj;
409   if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
410 
411   CHECK_GE(args.Length(), 2);
412   Utf8Value label(env->isolate(), args[0]);
413   int flags = args[1]->Uint32Value(env->context()).ToChecked();
414   bool fatal =
415       (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
416 
417   UErrorCode status = U_ZERO_ERROR;
418   UConverter* conv = ucnv_open(*label, &status);
419   if (U_FAILURE(status))
420     return;
421 
422   if (fatal) {
423     status = U_ZERO_ERROR;
424     ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
425                         nullptr, nullptr, nullptr, &status);
426   }
427 
428   auto converter = new ConverterObject(env, obj, conv, flags);
429   size_t sublen = ucnv_getMinCharSize(conv);
430   std::string sub(sublen, '?');
431   converter->set_subst_chars(sub.c_str());
432 
433   args.GetReturnValue().Set(obj);
434 }
435 
Decode(const FunctionCallbackInfo<Value> & args)436 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
437   Environment* env = Environment::GetCurrent(args);
438 
439   CHECK_GE(args.Length(), 4);  // Converter, Buffer, Flags, Encoding
440 
441   ConverterObject* converter;
442   ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
443 
444   if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
445         args[1]->IsArrayBufferView())) {
446     return node::THROW_ERR_INVALID_ARG_TYPE(
447         env->isolate(),
448         "The \"input\" argument must be an instance of SharedArrayBuffer, "
449         "ArrayBuffer or ArrayBufferView.");
450   }
451 
452   ArrayBufferViewContents<char> input(args[1]);
453   int flags = args[2]->Uint32Value(env->context()).ToChecked();
454 
455   CHECK(args[3]->IsString());
456   Local<String> from_encoding = args[3].As<String>();
457 
458   UErrorCode status = U_ZERO_ERROR;
459   MaybeStackBuffer<UChar> result;
460 
461   UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
462 
463   // When flushing the final chunk, the limit is the maximum
464   // of either the input buffer length or the number of pending
465   // characters times the min char size, multiplied by 2 as unicode may
466   // take up to 2 UChars to encode a character
467   size_t limit = 2 * converter->min_char_size() *
468       (!flush ?
469           input.length() :
470           std::max(
471               input.length(),
472               static_cast<size_t>(
473                   ucnv_toUCountPending(converter->conv(), &status))));
474   status = U_ZERO_ERROR;
475 
476   if (limit > 0)
477     result.AllocateSufficientStorage(limit);
478 
479   auto cleanup = OnScopeLeave([&]() {
480     if (flush) {
481       // Reset the converter state.
482       converter->set_bom_seen(false);
483       converter->reset();
484     }
485   });
486 
487   const char* source = input.data();
488   size_t source_length = input.length();
489 
490   UChar* target = *result;
491   ucnv_toUnicode(converter->conv(),
492                  &target,
493                  target + limit,
494                  &source,
495                  source + source_length,
496                  nullptr,
497                  flush,
498                  &status);
499 
500   if (U_SUCCESS(status)) {
501     bool omit_initial_bom = false;
502     if (limit > 0) {
503       result.SetLength(target - &result[0]);
504       if (result.length() > 0 &&
505           converter->unicode() &&
506           !converter->ignore_bom() &&
507           !converter->bom_seen()) {
508         // If the very first result in the stream is a BOM, and we are not
509         // explicitly told to ignore it, then we mark it for discarding.
510         if (result[0] == 0xFEFF)
511           omit_initial_bom = true;
512         converter->set_bom_seen(true);
513       }
514     }
515 
516     Local<Value> error;
517     UChar* output = result.out();
518     size_t beginning = 0;
519     size_t length = result.length() * sizeof(UChar);
520 
521     if (omit_initial_bom) {
522       // Perform `ret = ret.slice(2)`.
523       beginning += 2;
524       length -= 2;
525     }
526 
527     char* value = reinterpret_cast<char*>(output) + beginning;
528 
529     if (IsBigEndian()) {
530       SwapBytes16(value, length);
531     }
532 
533     MaybeLocal<Value> encoded =
534         StringBytes::Encode(env->isolate(), value, length, UCS2, &error);
535 
536     Local<Value> ret;
537     if (encoded.ToLocal(&ret)) {
538       args.GetReturnValue().Set(ret);
539       return;
540     }
541   }
542 
543   node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
544       env->isolate(),
545       "The encoded data was not valid for encoding %s",
546       *node::Utf8Value(env->isolate(), from_encoding));
547 }
548 
ConverterObject(Environment * env,Local<Object> wrap,UConverter * converter,int flags,const char * sub)549 ConverterObject::ConverterObject(
550     Environment* env,
551     Local<Object> wrap,
552     UConverter* converter,
553     int flags,
554     const char* sub)
555     : BaseObject(env, wrap),
556       Converter(converter, sub),
557       flags_(flags) {
558   MakeWeak();
559 
560   switch (ucnv_getType(converter)) {
561     case UCNV_UTF8:
562     case UCNV_UTF16_BigEndian:
563     case UCNV_UTF16_LittleEndian:
564       flags_ |= CONVERTER_FLAGS_UNICODE;
565       break;
566     default: {
567       // Fall through
568     }
569   }
570 }
571 
572 
InitializeICUDirectory(const std::string & path)573 bool InitializeICUDirectory(const std::string& path) {
574   UErrorCode status = U_ZERO_ERROR;
575   if (path.empty()) {
576 #ifdef NODE_HAVE_SMALL_ICU
577     // install the 'small' data.
578     udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
579 #else  // !NODE_HAVE_SMALL_ICU
580     // no small data, so nothing to do.
581 #endif  // !NODE_HAVE_SMALL_ICU
582   } else {
583     u_setDataDirectory(path.c_str());
584     u_init(&status);
585   }
586   return status == U_ZERO_ERROR;
587 }
588 
SetDefaultTimeZone(const char * tzid)589 void SetDefaultTimeZone(const char* tzid) {
590   size_t tzidlen = strlen(tzid) + 1;
591   UErrorCode status = U_ZERO_ERROR;
592   MaybeStackBuffer<UChar, 256> id(tzidlen);
593   u_charsToUChars(tzid, id.out(), tzidlen);
594   // This is threadsafe:
595   ucal_setDefaultTimeZone(id.out(), &status);
596   CHECK(U_SUCCESS(status));
597 }
598 
ToUnicode(MaybeStackBuffer<char> * buf,const char * input,size_t length)599 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
600                   const char* input,
601                   size_t length) {
602   UErrorCode status = U_ZERO_ERROR;
603   uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
604   UIDNA* uidna = uidna_openUTS46(options, &status);
605   if (U_FAILURE(status))
606     return -1;
607   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
608 
609   int32_t len = uidna_nameToUnicodeUTF8(uidna,
610                                         input, length,
611                                         **buf, buf->capacity(),
612                                         &info,
613                                         &status);
614 
615   // Do not check info.errors like we do with ToASCII since ToUnicode always
616   // returns a string, despite any possible errors that may have occurred.
617 
618   if (status == U_BUFFER_OVERFLOW_ERROR) {
619     status = U_ZERO_ERROR;
620     buf->AllocateSufficientStorage(len);
621     len = uidna_nameToUnicodeUTF8(uidna,
622                                   input, length,
623                                   **buf, buf->capacity(),
624                                   &info,
625                                   &status);
626   }
627 
628   // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
629   // string, regardless of whether an error occurred.
630 
631   if (U_FAILURE(status)) {
632     len = -1;
633     buf->SetLength(0);
634   } else {
635     buf->SetLength(len);
636   }
637 
638   uidna_close(uidna);
639   return len;
640 }
641 
ToASCII(MaybeStackBuffer<char> * buf,const char * input,size_t length,idna_mode mode)642 int32_t ToASCII(MaybeStackBuffer<char>* buf,
643                 const char* input,
644                 size_t length,
645                 idna_mode mode) {
646   UErrorCode status = U_ZERO_ERROR;
647   uint32_t options =                  // CheckHyphens = false; handled later
648     UIDNA_CHECK_BIDI |                // CheckBidi = true
649     UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
650     UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
651   if (mode == idna_mode::kStrict) {
652     options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
653                                       // VerifyDnsLength = beStrict;
654                                       //   handled later
655   }
656 
657   UIDNA* uidna = uidna_openUTS46(options, &status);
658   if (U_FAILURE(status))
659     return -1;
660   UIDNAInfo info = UIDNA_INFO_INITIALIZER;
661 
662   int32_t len = uidna_nameToASCII_UTF8(uidna,
663                                        input, length,
664                                        **buf, buf->capacity(),
665                                        &info,
666                                        &status);
667 
668   if (status == U_BUFFER_OVERFLOW_ERROR) {
669     status = U_ZERO_ERROR;
670     buf->AllocateSufficientStorage(len);
671     len = uidna_nameToASCII_UTF8(uidna,
672                                  input, length,
673                                  **buf, buf->capacity(),
674                                  &info,
675                                  &status);
676   }
677 
678   // In UTS #46 which specifies ToASCII, certain error conditions are
679   // configurable through options, and the WHATWG URL Standard promptly elects
680   // to disable some of them to accommodate for real-world use cases.
681   // Unfortunately, ICU4C's IDNA module does not support disabling some of
682   // these options through `options` above, and thus continues throwing
683   // unnecessary errors. To counter this situation, we just filter out the
684   // errors that may have happened afterwards, before deciding whether to
685   // return an error from this function.
686 
687   // CheckHyphens = false
688   // (Specified in the current UTS #46 draft rev. 18.)
689   // Refs:
690   // - https://github.com/whatwg/url/issues/53
691   // - https://github.com/whatwg/url/pull/309
692   // - http://www.unicode.org/review/pri317/
693   // - http://www.unicode.org/reports/tr46/tr46-18.html
694   // - https://www.icann.org/news/announcement-2000-01-07-en
695   info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
696   info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
697   info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
698 
699   if (mode != idna_mode::kStrict) {
700     // VerifyDnsLength = beStrict
701     info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
702     info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
703     info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
704   }
705 
706   if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
707     len = -1;
708     buf->SetLength(0);
709   } else {
710     buf->SetLength(len);
711   }
712 
713   uidna_close(uidna);
714   return len;
715 }
716 
ToUnicode(const FunctionCallbackInfo<Value> & args)717 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
718   Environment* env = Environment::GetCurrent(args);
719   CHECK_GE(args.Length(), 1);
720   CHECK(args[0]->IsString());
721   Utf8Value val(env->isolate(), args[0]);
722 
723   MaybeStackBuffer<char> buf;
724   int32_t len = ToUnicode(&buf, *val, val.length());
725 
726   if (len < 0) {
727     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
728   }
729 
730   args.GetReturnValue().Set(
731       String::NewFromUtf8(env->isolate(),
732                           *buf,
733                           NewStringType::kNormal,
734                           len).ToLocalChecked());
735 }
736 
ToASCII(const FunctionCallbackInfo<Value> & args)737 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
738   Environment* env = Environment::GetCurrent(args);
739   CHECK_GE(args.Length(), 1);
740   CHECK(args[0]->IsString());
741   Utf8Value val(env->isolate(), args[0]);
742   // optional arg
743   bool lenient = args[1]->BooleanValue(env->isolate());
744   idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
745 
746   MaybeStackBuffer<char> buf;
747   int32_t len = ToASCII(&buf, *val, val.length(), mode);
748 
749   if (len < 0) {
750     return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
751   }
752 
753   args.GetReturnValue().Set(
754       String::NewFromUtf8(env->isolate(),
755                           *buf,
756                           NewStringType::kNormal,
757                           len).ToLocalChecked());
758 }
759 
760 // This is similar to wcwidth except that it takes the current unicode
761 // character properties database into consideration, allowing it to
762 // correctly calculate the column widths of things like emojis and
763 // newer wide characters. wcwidth, on the other hand, uses a fixed
764 // algorithm that does not take things like emoji into proper
765 // consideration.
766 //
767 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
768 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
769 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
770 // allow it to be input. Linux's PTY terminal prints control characters as
771 // Narrow rhombi.
772 //
773 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
774 // consonants are 0-width when combined with initial consonants; otherwise they
775 // are technically Wide. But many terminals (including Konsole and
776 // VTE/GLib-based) implement all medials and finals as 0-width.
777 //
778 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
779 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
780 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
GetColumnWidth(UChar32 codepoint,bool ambiguous_as_full_width=false)781 static int GetColumnWidth(UChar32 codepoint,
782                           bool ambiguous_as_full_width = false) {
783   // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
784   // codepoint as being full width, wide, ambiguous, neutral, narrow,
785   // or halfwidth.
786   const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
787   switch (eaw) {
788     case U_EA_FULLWIDTH:
789     case U_EA_WIDE:
790       return 2;
791     case U_EA_AMBIGUOUS:
792       // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
793       if (ambiguous_as_full_width) {
794         return 2;
795       }
796       // If ambiguous_as_full_width is false:
797       [[fallthrough]];
798     case U_EA_NEUTRAL:
799       if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
800         return 2;
801       }
802       [[fallthrough]];
803     case U_EA_HALFWIDTH:
804     case U_EA_NARROW:
805     default:
806       const auto zero_width_mask = U_GC_CC_MASK |  // C0/C1 control code
807                                   U_GC_CF_MASK |  // Format control character
808                                   U_GC_ME_MASK |  // Enclosing mark
809                                   U_GC_MN_MASK;   // Nonspacing mark
810       if (codepoint != 0x00AD &&  // SOFT HYPHEN is Cf but not zero-width
811           ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
812           u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
813         return 0;
814       }
815       return 1;
816   }
817 }
818 
819 // Returns the column width for the given String.
GetStringWidth(const FunctionCallbackInfo<Value> & args)820 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
821   Environment* env = Environment::GetCurrent(args);
822   CHECK(args[0]->IsString());
823 
824   bool ambiguous_as_full_width = args[1]->IsTrue();
825   bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
826 
827   TwoByteValue value(env->isolate(), args[0]);
828   // reinterpret_cast is required by windows to compile
829   UChar* str = reinterpret_cast<UChar*>(*value);
830   static_assert(sizeof(*str) == sizeof(**value),
831                 "sizeof(*str) == sizeof(**value)");
832   UChar32 c = 0;
833   UChar32 p;
834   size_t n = 0;
835   uint32_t width = 0;
836 
837   while (n < value.length()) {
838     p = c;
839     U16_NEXT(str, n, value.length(), c);
840     // Don't count individual emoji codepoints that occur within an
841     // emoji sequence. This is not necessarily foolproof. Some
842     // environments display emoji sequences in the appropriate
843     // condensed form (as a single emoji glyph), other environments
844     // may not understand an emoji sequence and will display each
845     // individual emoji separately. When this happens, the width
846     // calculated will be off, and there's no reliable way of knowing
847     // in advance if a particular sequence is going to be supported.
848     // The expand_emoji_sequence option allows the caller to skip this
849     // check and count each code within an emoji sequence separately.
850     // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
851     if (!expand_emoji_sequence &&
852         n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
853         (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
854          u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
855       continue;
856     }
857     width += GetColumnWidth(c, ambiguous_as_full_width);
858   }
859   args.GetReturnValue().Set(width);
860 }
861 
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)862 void Initialize(Local<Object> target,
863                 Local<Value> unused,
864                 Local<Context> context,
865                 void* priv) {
866   Environment* env = Environment::GetCurrent(context);
867   SetMethod(context, target, "toUnicode", ToUnicode);
868   SetMethod(context, target, "toASCII", ToASCII);
869   SetMethod(context, target, "getStringWidth", GetStringWidth);
870 
871   // One-shot converters
872   SetMethod(context, target, "icuErrName", ICUErrorName);
873   SetMethod(context, target, "transcode", Transcode);
874 
875   // ConverterObject
876   {
877     Local<FunctionTemplate> t = NewFunctionTemplate(env->isolate(), nullptr);
878     t->Inherit(BaseObject::GetConstructorTemplate(env));
879     t->InstanceTemplate()->SetInternalFieldCount(
880         ConverterObject::kInternalFieldCount);
881     Local<String> converter_string =
882         FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
883     t->SetClassName(converter_string);
884     env->set_i18n_converter_template(t->InstanceTemplate());
885   }
886 
887   SetMethod(context, target, "getConverter", ConverterObject::Create);
888   SetMethod(context, target, "decode", ConverterObject::Decode);
889   SetMethod(context, target, "hasConverter", ConverterObject::Has);
890 }
891 
RegisterExternalReferences(ExternalReferenceRegistry * registry)892 void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
893   registry->Register(ToUnicode);
894   registry->Register(ToASCII);
895   registry->Register(GetStringWidth);
896   registry->Register(ICUErrorName);
897   registry->Register(Transcode);
898   registry->Register(ConverterObject::Create);
899   registry->Register(ConverterObject::Decode);
900   registry->Register(ConverterObject::Has);
901 }
902 
903 }  // namespace i18n
904 }  // namespace node
905 
906 NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
907 NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences)
908 
909 #endif  // NODE_HAVE_I18N_SUPPORT
910