• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1'use strict';
2
3// An implementation of the WHATWG Encoding Standard
4// https://encoding.spec.whatwg.org
5
6const {
7  ObjectCreate,
8  ObjectDefineProperties,
9  ObjectGetOwnPropertyDescriptors,
10  SafeMap,
11  StringPrototypeSlice,
12  Symbol,
13  SymbolToStringTag,
14  Uint32Array,
15  Uint8Array,
16} = primordials;
17
18const {
19  ERR_ENCODING_INVALID_ENCODED_DATA,
20  ERR_ENCODING_NOT_SUPPORTED,
21  ERR_INVALID_ARG_TYPE,
22  ERR_INVALID_THIS,
23  ERR_NO_ICU
24} = require('internal/errors').codes;
// Private symbol keys used to attach internal state to encoder/decoder
// instances.
// kHandle: the decoding backend held by a TextDecoder (the converter returned
// by the ICU binding, or a StringDecoder instance in the JS fallback).
const kHandle = Symbol('handle');
// kFlags: bit field built from the CONVERTER_FLAGS_* constants.
const kFlags = Symbol('flags');
// kEncoding: the encoding name resolved by getEncodingFromLabel().
const kEncoding = Symbol('encoding');
// kDecoder / kEncoder: brand markers set to `true` by the constructors and
// checked by validateDecoder() / validateEncoder().
const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');
30
31const {
32  getConstructorOf,
33  customInspectSymbol: inspect
34} = require('internal/util');
35
36const {
37  isAnyArrayBuffer,
38  isArrayBufferView,
39  isUint8Array
40} = require('internal/util/types');
41
42const {
43  validateString,
44  validateObject,
45} = require('internal/validators');
46
47const {
48  encodeInto,
49  encodeUtf8String
50} = internalBinding('buffer');
51
52let Buffer;
53function lazyBuffer() {
54  if (Buffer === undefined)
55    Buffer = require('buffer').Buffer;
56  return Buffer;
57}
58
/**
 * Brand check for TextEncoder methods and accessors.
 * @param {any} obj The receiver (`this`) to check.
 * @throws {ERR_INVALID_THIS} If `obj` is null/undefined or was not branded
 *   with `kEncoder` set to `true`.
 */
function validateEncoder(obj) {
  const isEncoder = obj != null && obj[kEncoder] === true;
  if (!isEncoder) {
    throw new ERR_INVALID_THIS('TextEncoder');
  }
}
63
/**
 * Brand check for TextDecoder methods and accessors.
 * @param {any} obj The receiver (`this`) to check.
 * @throws {ERR_INVALID_THIS} If `obj` is null/undefined or was not branded
 *   with `kDecoder` set to `true`.
 */
function validateDecoder(obj) {
  const isDecoder = obj != null && obj[kDecoder] === true;
  if (!isDecoder) {
    throw new ERR_INVALID_THIS('TextDecoder');
  }
}
68
// Bit flags describing decoder behavior. FATAL and IGNORE_BOM are computed in
// the TextDecoder constructors and stored under kFlags; FLUSH is set for a
// decode() call that is not streaming.
const CONVERTER_FLAGS_FLUSH = 1 << 0;
const CONVERTER_FLAGS_FATAL = 1 << 1;
const CONVERTER_FLAGS_IGNORE_BOM = 1 << 2;

// Zero-length input used as the default `input` argument of decode().
const empty = new Uint8Array();
74
// Lookup table from every accepted (lower-case) label to the name of the
// encoding it selects. Labels are listed per encoding and inserted in the
// order shown, so each label maps to exactly one name.
const encodings = new SafeMap();
{
  const labelsByEncoding = [
    ['utf-8', ['unicode-1-1-utf-8', 'utf8', 'utf-8']],
    ['ibm866', ['866', 'cp866', 'csibm866', 'ibm866']],
    ['iso-8859-2', [
      'csisolatin2', 'iso-8859-2', 'iso-ir-101', 'iso8859-2', 'iso88592',
      'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2',
    ]],
    ['iso-8859-3', [
      'csisolatin3', 'iso-8859-3', 'iso-ir-109', 'iso8859-3', 'iso88593',
      'iso_8859-3', 'iso_8859-3:1988', 'l3', 'latin3',
    ]],
    ['iso-8859-4', [
      'csisolatin4', 'iso-8859-4', 'iso-ir-110', 'iso8859-4', 'iso88594',
      'iso_8859-4', 'iso_8859-4:1988', 'l4', 'latin4',
    ]],
    ['iso-8859-5', [
      'csisolatincyrillic', 'cyrillic', 'iso-8859-5', 'iso-ir-144',
      'iso8859-5', 'iso88595', 'iso_8859-5', 'iso_8859-5:1988',
    ]],
    ['iso-8859-6', [
      'arabic', 'asmo-708', 'csiso88596e', 'csiso88596i', 'csisolatinarabic',
      'ecma-114', 'iso-8859-6', 'iso-8859-6-e', 'iso-8859-6-i', 'iso-ir-127',
      'iso8859-6', 'iso88596', 'iso_8859-6', 'iso_8859-6:1987',
    ]],
    ['iso-8859-7', [
      'csisolatingreek', 'ecma-118', 'elot_928', 'greek', 'greek8',
      'iso-8859-7', 'iso-ir-126', 'iso8859-7', 'iso88597', 'iso_8859-7',
      'iso_8859-7:1987', 'sun_eu_greek',
    ]],
    ['iso-8859-8', [
      'csiso88598e', 'csisolatinhebrew', 'hebrew', 'iso-8859-8',
      'iso-8859-8-e', 'iso-ir-138', 'iso8859-8', 'iso88598', 'iso_8859-8',
      'iso_8859-8:1988', 'visual',
    ]],
    ['iso-8859-8-i', ['csiso88598i', 'iso-8859-8-i', 'logical']],
    ['iso-8859-10', [
      'csisolatin6', 'iso-8859-10', 'iso-ir-157', 'iso8859-10', 'iso885910',
      'l6', 'latin6',
    ]],
    ['iso-8859-13', ['iso-8859-13', 'iso8859-13', 'iso885913']],
    ['iso-8859-14', ['iso-8859-14', 'iso8859-14', 'iso885914']],
    ['iso-8859-15', [
      'csisolatin9', 'iso-8859-15', 'iso8859-15', 'iso885915', 'iso_8859-15',
      'l9',
    ]],
    ['koi8-r', ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r']],
    ['koi8-u', ['koi8-ru', 'koi8-u']],
    ['macintosh', ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman']],
    ['windows-874', [
      'dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620',
      'windows-874',
    ]],
    ['windows-1250', ['cp1250', 'windows-1250', 'x-cp1250']],
    ['windows-1251', ['cp1251', 'windows-1251', 'x-cp1251']],
    ['windows-1252', [
      'ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819',
      'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1',
      'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'windows-1252',
      'x-cp1252',
    ]],
    ['windows-1253', ['cp1253', 'windows-1253', 'x-cp1253']],
    ['windows-1254', [
      'cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9',
      'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5',
      'windows-1254', 'x-cp1254',
    ]],
    ['windows-1255', ['cp1255', 'windows-1255', 'x-cp1255']],
    ['windows-1256', ['cp1256', 'windows-1256', 'x-cp1256']],
    ['windows-1257', ['cp1257', 'windows-1257', 'x-cp1257']],
    ['windows-1258', ['cp1258', 'windows-1258', 'x-cp1258']],
    ['x-mac-cyrillic', ['x-mac-cyrillic', 'x-mac-ukrainian']],
    ['gbk', [
      'chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312',
      'gb_2312-80', 'gbk', 'iso-ir-58', 'x-gbk',
    ]],
    ['gb18030', ['gb18030']],
    ['big5', ['big5', 'big5-hkscs', 'cn-big5', 'csbig5', 'x-x-big5']],
    ['euc-jp', ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp']],
    ['iso-2022-jp', ['csiso2022jp', 'iso-2022-jp']],
    ['shift_jis', [
      'csshiftjis', 'ms932', 'ms_kanji', 'shift-jis', 'shift_jis', 'sjis',
      'windows-31j', 'x-sjis',
    ]],
    ['euc-kr', [
      'cseuckr', 'csksc56011987', 'euc-kr', 'iso-ir-149', 'korean',
      'ks_c_5601-1987', 'ks_c_5601-1989', 'ksc5601', 'ksc_5601',
      'windows-949',
    ]],
    ['utf-16be', ['utf-16be']],
    ['utf-16le', ['utf-16le', 'utf-16']],
  ];

  // Index-based loops: populate the map one label at a time.
  for (let i = 0; i < labelsByEncoding.length; i++) {
    const name = labelsByEncoding[i][0];
    const labels = labelsByEncoding[i][1];
    for (let j = 0; j < labels.length; j++) {
      encodings.set(labels[j], name);
    }
  }
}
288
/**
 * Strips leading and trailing ASCII whitespace (TAB, LF, FF, CR, SPACE) from
 * a label. String.prototype.trim() is not usable here because it would also
 * strip non-ASCII whitespace, so the scan is done by hand.
 * @param {string} label The label to trim.
 * @returns {string} `label` without surrounding ASCII whitespace.
 */
function trimAsciiWhitespace(label) {
  const isAsciiWhitespace = (ch) =>
    ch === '\u0009' ||
    ch === '\u000a' ||
    ch === '\u000c' ||
    ch === '\u000d' ||
    ch === '\u0020';

  let start = 0;
  let end = label.length;
  // Advance past leading whitespace, then back up over trailing whitespace.
  while (start < end && isAsciiWhitespace(label[start])) {
    start++;
  }
  while (end > start && isAsciiWhitespace(label[end - 1])) {
    end--;
  }
  return StringPrototypeSlice(label, start, end);
}
312
/**
 * Resolves an encoding label to the encoding name stored in `encodings`.
 *
 * Matching ignores surrounding ASCII whitespace and ASCII letter case only.
 * @param {string} label The label supplied by the caller.
 * @returns {string|undefined} The encoding name, or undefined when the label
 *   is not recognized.
 */
function getEncodingFromLabel(label) {
  // Fast path: the label is already in normalized form.
  const enc = encodings.get(label);
  if (enc !== undefined) return enc;

  // Every known label is pure ASCII, so a label containing any non-ASCII code
  // unit can never match. Rejecting it up front matters because
  // String.prototype.toLowerCase() is Unicode-aware: it folds U+212A KELVIN
  // SIGN to 'k', which would otherwise let e.g. '\u212Aoi8-r' match 'koi8-r'.
  for (let i = 0; i < label.length; i++) {
    if (label[i] > '\u007f') return undefined;
  }
  return encodings.get(trimAsciiWhitespace(label.toLowerCase()));
}
318
// Scratch array shared by every encodeInto() call. After the binding runs,
// index 0 is reported as `read` and index 1 as `written`.
const encodeIntoResults = new Uint32Array(2);

/**
 * Encoder whose `encoding` is always 'utf-8'. Every method and accessor
 * brand-checks its receiver with validateEncoder().
 */
class TextEncoder {
  constructor() {
    this[kEncoder] = true;
  }

  /** @returns {string} Always 'utf-8'. */
  get encoding() {
    validateEncoder(this);
    return 'utf-8';
  }

  /**
   * Encodes `input` after converting it to a string.
   * @param {any} [input] Value to encode; defaults to the empty string.
   * @returns {any} Whatever the `encodeUtf8String` binding returns.
   */
  encode(input = '') {
    validateEncoder(this);
    const text = `${input}`;
    return encodeUtf8String(text);
  }

  /**
   * Encodes `src` into the caller-supplied `dest` array.
   * @param {string} src String to encode.
   * @param {Uint8Array} dest Destination array.
   * @returns {{ read: number, written: number }} Counts reported by the
   *   binding.
   * @throws {ERR_INVALID_ARG_TYPE} If `dest` is not a Uint8Array.
   */
  encodeInto(src, dest) {
    validateEncoder(this);
    validateString(src, 'src');
    const isValidDest = dest && isUint8Array(dest);
    if (!isValidDest) {
      throw new ERR_INVALID_ARG_TYPE('dest', 'Uint8Array', dest);
    }
    encodeInto(src, dest, encodeIntoResults);
    const read = encodeIntoResults[0];
    const written = encodeIntoResults[1];
    return { read, written };
  }

  /**
   * Custom inspection hook: shows a plain object carrying only `encoding`,
   * labelled with this instance's constructor.
   */
  [inspect](depth, opts) {
    validateEncoder(this);
    const depthExhausted = typeof depth === 'number' && depth < 0;
    if (depthExhausted) {
      return this;
    }
    const ownCtor = getConstructorOf(this);
    const summary = ObjectCreate({
      constructor: ownCtor !== null ? ownCtor : TextEncoder
    });
    summary.encoding = this.encoding;
    // Loaded lazily to avoid a circular dependency.
    const utilInspect = require('internal/util/inspect');
    return utilInspect.inspect(summary, opts);
  }
}

// Make the public members enumerable and tag instances as 'TextEncoder'.
ObjectDefineProperties(TextEncoder.prototype, {
  encode: { enumerable: true },
  encodeInto: { enumerable: true },
  encoding: { enumerable: true },
  [SymbolToStringTag]: { configurable: true, value: 'TextEncoder' },
});
366
// Choose the decoder implementation once at load time: the ICU-backed class
// when the config binding reports `hasIntl`, otherwise the JS fallback.
const { hasIntl } = internalBinding('config');
const TextDecoder = hasIntl ? makeTextDecoderICU() : makeTextDecoderJS();
371
/**
 * Builds the TextDecoder class backed by the `icu` internal binding.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderICU() {
  const {
    decode: _decode,
    getConverter,
  } = internalBinding('icu');

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; defaults to 'utf-8'.
     * @param {object|null} [options] May carry `fatal` and `ignoreBOM`.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or the
     *   binding returns no converter for it.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      if (options !== null) {
        flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0;
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      const handle = getConverter(enc, flags);
      if (handle === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      this[kDecoder] = true;
      this[kHandle] = handle;
      this[kFlags] = flags;
      this[kEncoding] = enc;
    }

    /**
     * Decodes `input` with this decoder's converter.
     * @param {ArrayBuffer|ArrayBufferView} [input] Bytes to decode; defaults
     *   to an empty array.
     * @param {object|null} [options] `options.stream` truthy requests
     *   streaming (no flush).
     * @returns {string} The decoded text.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` is neither an ArrayBuffer nor
     *   an ArrayBufferView.
     * @throws {ERR_ENCODING_INVALID_ENCODED_DATA} If the binding reports a
     *   numeric error code instead of a result.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);
      if (isAnyArrayBuffer(input)) {
        input = lazyBuffer().from(input);
      } else if (!isArrayBufferView(input)) {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // Flush unless streaming was explicitly requested. `null` options mean
      // "all defaults", so they must flush too; this matches the JS fallback
      // decoder in this file.
      const streaming = options !== null && options.stream;
      const flags = streaming ? 0 : CONVERTER_FLAGS_FLUSH;

      const ret = _decode(this[kHandle], input, flags);
      if (typeof ret === 'number') {
        throw new ERR_ENCODING_INVALID_ENCODED_DATA(this.encoding, ret);
      }
      return ret.toString('ucs2');
    }
  }

  return TextDecoder;
}
437
/**
 * Builds the fallback TextDecoder class, implemented on top of
 * StringDecoder. It only supports 'utf-8' and 'utf-16le' and rejects the
 * `fatal` option.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderJS() {
  let StringDecoder;
  // Loads 'string_decoder' on first use and caches its StringDecoder export.
  function lazyStringDecoder() {
    if (StringDecoder === undefined) {
      StringDecoder = require('string_decoder').StringDecoder;
    }
    return StringDecoder;
  }

  // Per-instance marker: true once the start of the current stream has been
  // examined for a BOM.
  const kBOMSeen = Symbol('BOM seen');

  const hasConverter = (encoding) =>
    encoding === 'utf-8' || encoding === 'utf-16le';

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; defaults to 'utf-8'.
     * @param {object|null} [options] May carry `ignoreBOM`; a truthy `fatal`
     *   is rejected.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or has no
     *   converter here.
     * @throws {ERR_NO_ICU} If `options.fatal` is truthy.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const enc = getEncodingFromLabel(encoding);
      const isSupported = enc !== undefined && hasConverter(enc);
      if (!isSupported) {
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);
      }

      let flags = 0;
      if (options !== null) {
        if (options.fatal) {
          throw new ERR_NO_ICU('"fatal" option');
        }
        if (options.ignoreBOM) {
          flags |= CONVERTER_FLAGS_IGNORE_BOM;
        }
      }

      const Decoder = lazyStringDecoder();
      this[kDecoder] = true;
      // StringDecoder will normalize WHATWG encoding to Node.js encoding.
      this[kHandle] = new Decoder(enc);
      this[kFlags] = flags;
      this[kEncoding] = enc;
      this[kBOMSeen] = false;
    }

    /**
     * Decodes `input`, discarding a leading U+FEFF at the start of a stream
     * unless `ignoreBOM` was requested.
     * @param {ArrayBuffer|ArrayBufferView} [input] Bytes to decode; defaults
     *   to an empty array.
     * @param {object|null} [options] `options.stream` truthy keeps incomplete
     *   trailing bytes for the next call.
     * @returns {string} The decoded text.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` is neither an ArrayBuffer nor
     *   an ArrayBufferView.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);

      let bytes;
      if (isAnyArrayBuffer(input)) {
        bytes = lazyBuffer().from(input);
      } else if (isArrayBufferView(input)) {
        bytes = lazyBuffer().from(input.buffer, input.byteOffset,
                                  input.byteLength);
      } else {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // A flush on the previous call ended that stream, so BOM detection
      // starts over for this one.
      const previousCallFlushed =
        (this[kFlags] & CONVERTER_FLAGS_FLUSH) !== 0;
      if (previousCallFlushed) {
        this[kBOMSeen] = false;
      }

      const streaming = options !== null && options.stream;
      const decoder = this[kHandle];
      let text;
      if (streaming) {
        this[kFlags] &= ~CONVERTER_FLAGS_FLUSH;
        text = decoder.write(bytes);
      } else {
        this[kFlags] |= CONVERTER_FLAGS_FLUSH;
        text = decoder.end(bytes);
      }

      const ignoreBOM = (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) !== 0;
      if (text.length === 0 || this[kBOMSeen] || ignoreBOM) {
        return text;
      }

      // If the very first result in the stream is a BOM, and we are not
      // explicitly told to ignore it, then we discard it.
      this[kBOMSeen] = true;
      return text[0] === '\ufeff' ? StringPrototypeSlice(text, 1) : text;
    }
  }

  return TextDecoder;
}
530
// Accessors and the inspection hook are identical for both TextDecoder
// implementations, so they are defined once here and copied onto whichever
// class was selected.
ObjectDefineProperties(
  TextDecoder.prototype,
  ObjectGetOwnPropertyDescriptors({
    // Encoding name stored by the constructor.
    get encoding() {
      validateDecoder(this);
      return this[kEncoding];
    },

    // True when the FATAL bit is set in the stored flags.
    get fatal() {
      validateDecoder(this);
      const flags = this[kFlags];
      return (flags & CONVERTER_FLAGS_FATAL) !== 0;
    },

    // True when the IGNORE_BOM bit is set in the stored flags.
    get ignoreBOM() {
      validateDecoder(this);
      const flags = this[kFlags];
      return (flags & CONVERTER_FLAGS_IGNORE_BOM) !== 0;
    },

    // Custom inspection hook: renders the public attributes, plus the
    // internal flags and handle when `showHidden` is requested.
    [inspect](depth, opts) {
      validateDecoder(this);
      const depthExhausted = typeof depth === 'number' && depth < 0;
      if (depthExhausted) {
        return this;
      }
      const ctor = getConstructorOf(this) || TextDecoder;
      const summary = ObjectCreate({ constructor: ctor });
      summary.encoding = this.encoding;
      summary.fatal = this.fatal;
      summary.ignoreBOM = this.ignoreBOM;
      if (opts.showHidden) {
        summary[kFlags] = this[kFlags];
        summary[kHandle] = this[kHandle];
      }
      // Lazy to avoid circular dependency
      const utilInspect = require('internal/util/inspect').inspect;
      return `${ctor.name} ${utilInspect(summary)}`;
    }
  })
);

// Adjust enumerability of the class's own members and tag instances as
// 'TextDecoder'.
ObjectDefineProperties(TextDecoder.prototype, {
  decode: { enumerable: true },
  [inspect]: { enumerable: false },
  [SymbolToStringTag]: { configurable: true, value: 'TextDecoder' }
});
579
580module.exports = {
581  getEncodingFromLabel,
582  TextDecoder,
583  TextEncoder
584};
585