'use strict';

// An implementation of the WHATWG Encoding Standard
// https://encoding.spec.whatwg.org

const {
  ObjectCreate,
  ObjectDefineProperties,
  ObjectGetOwnPropertyDescriptors,
  SafeMap,
  StringFromCharCode,
  StringPrototypeCharCodeAt,
  StringPrototypeSlice,
  Symbol,
  SymbolToStringTag,
  Uint32Array,
  Uint8Array,
} = primordials;

const {
  ERR_ENCODING_INVALID_ENCODED_DATA,
  ERR_ENCODING_NOT_SUPPORTED,
  ERR_INVALID_ARG_TYPE,
  ERR_INVALID_THIS,
  ERR_NO_ICU
} = require('internal/errors').codes;

// Symbol-keyed slots used to store per-instance state on encoders/decoders.
const kHandle = Symbol('handle');      // Underlying converter object.
const kFlags = Symbol('flags');        // Bitmask of CONVERTER_FLAGS_* values.
const kEncoding = Symbol('encoding');  // Canonical encoding name.
const kDecoder = Symbol('decoder');    // Brand: `true` on TextDecoder objects.
const kEncoder = Symbol('encoder');    // Brand: `true` on TextEncoder objects.

const {
  getConstructorOf,
  customInspectSymbol: inspect
} = require('internal/util');

const {
  isAnyArrayBuffer,
  isArrayBufferView,
  isUint8Array
} = require('internal/util/types');

const {
  validateString,
  validateObject,
} = require('internal/validators');

const {
  encodeInto,
  encodeUtf8String
} = internalBinding('buffer');

// `Buffer` is loaded on first use rather than at module load time.
let Buffer;
function lazyBuffer() {
  if (Buffer === undefined)
    Buffer = require('buffer').Buffer;
  return Buffer;
}

/**
 * Brand check for TextEncoder methods.
 * @param {any} obj The receiver (`this`) to check.
 * @throws {ERR_INVALID_THIS} If `obj` is nullish or lacks the encoder brand.
 */
function validateEncoder(obj) {
  if (obj == null || obj[kEncoder] !== true)
    throw new ERR_INVALID_THIS('TextEncoder');
}

/**
 * Brand check for TextDecoder methods.
 * @param {any} obj The receiver (`this`) to check.
 * @throws {ERR_INVALID_THIS} If `obj` is nullish or lacks the decoder brand.
 */
function validateDecoder(obj) {
  if (obj == null || obj[kDecoder] !== true)
    throw new ERR_INVALID_THIS('TextDecoder');
}

// Bit flags stored in `this[kFlags]` and/or passed to the converter.
const CONVERTER_FLAGS_FLUSH = 0x1;
const CONVERTER_FLAGS_FATAL = 0x2;
const CONVERTER_FLAGS_IGNORE_BOM = 0x4;

// Default `input` for decode() when the caller passes nothing.
const empty = new Uint8Array(0);

// Maps every recognised (lower-case) label to its canonical encoding name.
const encodings = new SafeMap([
  ['unicode-1-1-utf-8', 'utf-8'],
  ['utf8', 'utf-8'],
  ['utf-8', 'utf-8'],
  ['866', 'ibm866'],
  ['cp866', 'ibm866'],
  ['csibm866', 'ibm866'],
  ['ibm866', 'ibm866'],
  ['csisolatin2', 'iso-8859-2'],
  ['iso-8859-2', 'iso-8859-2'],
  ['iso-ir-101', 'iso-8859-2'],
  ['iso8859-2', 'iso-8859-2'],
  ['iso88592', 'iso-8859-2'],
  ['iso_8859-2', 'iso-8859-2'],
  ['iso_8859-2:1987', 'iso-8859-2'],
  ['l2', 'iso-8859-2'],
  ['latin2', 'iso-8859-2'],
  ['csisolatin3', 'iso-8859-3'],
  ['iso-8859-3', 'iso-8859-3'],
  ['iso-ir-109', 'iso-8859-3'],
  ['iso8859-3', 'iso-8859-3'],
  ['iso88593', 'iso-8859-3'],
  ['iso_8859-3', 'iso-8859-3'],
  ['iso_8859-3:1988', 'iso-8859-3'],
  ['l3', 'iso-8859-3'],
  ['latin3', 'iso-8859-3'],
  ['csisolatin4', 'iso-8859-4'],
  ['iso-8859-4', 'iso-8859-4'],
  ['iso-ir-110', 'iso-8859-4'],
  ['iso8859-4', 'iso-8859-4'],
  ['iso88594', 'iso-8859-4'],
  ['iso_8859-4', 'iso-8859-4'],
  ['iso_8859-4:1988', 'iso-8859-4'],
  ['l4', 'iso-8859-4'],
  ['latin4', 'iso-8859-4'],
  ['csisolatincyrillic', 'iso-8859-5'],
  ['cyrillic', 'iso-8859-5'],
  ['iso-8859-5', 'iso-8859-5'],
  ['iso-ir-144', 'iso-8859-5'],
  ['iso8859-5', 'iso-8859-5'],
  ['iso88595', 'iso-8859-5'],
  ['iso_8859-5', 'iso-8859-5'],
  ['iso_8859-5:1988', 'iso-8859-5'],
  ['arabic', 'iso-8859-6'],
  ['asmo-708', 'iso-8859-6'],
  ['csiso88596e', 'iso-8859-6'],
  ['csiso88596i', 'iso-8859-6'],
  ['csisolatinarabic', 'iso-8859-6'],
  ['ecma-114', 'iso-8859-6'],
  ['iso-8859-6', 'iso-8859-6'],
  ['iso-8859-6-e', 'iso-8859-6'],
  ['iso-8859-6-i', 'iso-8859-6'],
  ['iso-ir-127', 'iso-8859-6'],
  ['iso8859-6', 'iso-8859-6'],
  ['iso88596', 'iso-8859-6'],
  ['iso_8859-6', 'iso-8859-6'],
  ['iso_8859-6:1987', 'iso-8859-6'],
  ['csisolatingreek', 'iso-8859-7'],
  ['ecma-118', 'iso-8859-7'],
  ['elot_928', 'iso-8859-7'],
  ['greek', 'iso-8859-7'],
  ['greek8', 'iso-8859-7'],
  ['iso-8859-7', 'iso-8859-7'],
  ['iso-ir-126', 'iso-8859-7'],
  ['iso8859-7', 'iso-8859-7'],
  ['iso88597', 'iso-8859-7'],
  ['iso_8859-7', 'iso-8859-7'],
  ['iso_8859-7:1987', 'iso-8859-7'],
  ['sun_eu_greek', 'iso-8859-7'],
  ['csiso88598e', 'iso-8859-8'],
  ['csisolatinhebrew', 'iso-8859-8'],
  ['hebrew', 'iso-8859-8'],
  ['iso-8859-8', 'iso-8859-8'],
  ['iso-8859-8-e', 'iso-8859-8'],
  ['iso-ir-138', 'iso-8859-8'],
  ['iso8859-8', 'iso-8859-8'],
  ['iso88598', 'iso-8859-8'],
  ['iso_8859-8', 'iso-8859-8'],
  ['iso_8859-8:1988', 'iso-8859-8'],
  ['visual', 'iso-8859-8'],
  ['csiso88598i', 'iso-8859-8-i'],
  ['iso-8859-8-i', 'iso-8859-8-i'],
  ['logical', 'iso-8859-8-i'],
  ['csisolatin6', 'iso-8859-10'],
  ['iso-8859-10', 'iso-8859-10'],
  ['iso-ir-157', 'iso-8859-10'],
  ['iso8859-10', 'iso-8859-10'],
  ['iso885910', 'iso-8859-10'],
  ['l6', 'iso-8859-10'],
  ['latin6', 'iso-8859-10'],
  ['iso-8859-13', 'iso-8859-13'],
  ['iso8859-13', 'iso-8859-13'],
  ['iso885913', 'iso-8859-13'],
  ['iso-8859-14', 'iso-8859-14'],
  ['iso8859-14', 'iso-8859-14'],
  ['iso885914', 'iso-8859-14'],
  ['csisolatin9', 'iso-8859-15'],
  ['iso-8859-15', 'iso-8859-15'],
  ['iso8859-15', 'iso-8859-15'],
  ['iso885915', 'iso-8859-15'],
  ['iso_8859-15', 'iso-8859-15'],
  ['l9', 'iso-8859-15'],
  ['cskoi8r', 'koi8-r'],
  ['koi', 'koi8-r'],
  ['koi8', 'koi8-r'],
  ['koi8-r', 'koi8-r'],
  ['koi8_r', 'koi8-r'],
  ['koi8-ru', 'koi8-u'],
  ['koi8-u', 'koi8-u'],
  ['csmacintosh', 'macintosh'],
  ['mac', 'macintosh'],
  ['macintosh', 'macintosh'],
  ['x-mac-roman', 'macintosh'],
  ['dos-874', 'windows-874'],
  ['iso-8859-11', 'windows-874'],
  ['iso8859-11', 'windows-874'],
  ['iso885911', 'windows-874'],
  ['tis-620', 'windows-874'],
  ['windows-874', 'windows-874'],
  ['cp1250', 'windows-1250'],
  ['windows-1250', 'windows-1250'],
  ['x-cp1250', 'windows-1250'],
  ['cp1251', 'windows-1251'],
  ['windows-1251', 'windows-1251'],
  ['x-cp1251', 'windows-1251'],
  ['ansi_x3.4-1968', 'windows-1252'],
  ['ascii', 'windows-1252'],
  ['cp1252', 'windows-1252'],
  ['cp819', 'windows-1252'],
  ['csisolatin1', 'windows-1252'],
  ['ibm819', 'windows-1252'],
  ['iso-8859-1', 'windows-1252'],
  ['iso-ir-100', 'windows-1252'],
  ['iso8859-1', 'windows-1252'],
  ['iso88591', 'windows-1252'],
  ['iso_8859-1', 'windows-1252'],
  ['iso_8859-1:1987', 'windows-1252'],
  ['l1', 'windows-1252'],
  ['latin1', 'windows-1252'],
  ['us-ascii', 'windows-1252'],
  ['windows-1252', 'windows-1252'],
  ['x-cp1252', 'windows-1252'],
  ['cp1253', 'windows-1253'],
  ['windows-1253', 'windows-1253'],
  ['x-cp1253', 'windows-1253'],
  ['cp1254', 'windows-1254'],
  ['csisolatin5', 'windows-1254'],
  ['iso-8859-9', 'windows-1254'],
  ['iso-ir-148', 'windows-1254'],
  ['iso8859-9', 'windows-1254'],
  ['iso88599', 'windows-1254'],
  ['iso_8859-9', 'windows-1254'],
  ['iso_8859-9:1989', 'windows-1254'],
  ['l5', 'windows-1254'],
  ['latin5', 'windows-1254'],
  ['windows-1254', 'windows-1254'],
  ['x-cp1254', 'windows-1254'],
  ['cp1255', 'windows-1255'],
  ['windows-1255', 'windows-1255'],
  ['x-cp1255', 'windows-1255'],
  ['cp1256', 'windows-1256'],
  ['windows-1256', 'windows-1256'],
  ['x-cp1256', 'windows-1256'],
  ['cp1257', 'windows-1257'],
  ['windows-1257', 'windows-1257'],
  ['x-cp1257', 'windows-1257'],
  ['cp1258', 'windows-1258'],
  ['windows-1258', 'windows-1258'],
  ['x-cp1258', 'windows-1258'],
  ['x-mac-cyrillic', 'x-mac-cyrillic'],
  ['x-mac-ukrainian', 'x-mac-cyrillic'],
  ['chinese', 'gbk'],
  ['csgb2312', 'gbk'],
  ['csiso58gb231280', 'gbk'],
  ['gb2312', 'gbk'],
  ['gb_2312', 'gbk'],
  ['gb_2312-80', 'gbk'],
  ['gbk', 'gbk'],
  ['iso-ir-58', 'gbk'],
  ['x-gbk', 'gbk'],
  ['gb18030', 'gb18030'],
  ['big5', 'big5'],
  ['big5-hkscs', 'big5'],
  ['cn-big5', 'big5'],
  ['csbig5', 'big5'],
  ['x-x-big5', 'big5'],
  ['cseucpkdfmtjapanese', 'euc-jp'],
  ['euc-jp', 'euc-jp'],
  ['x-euc-jp', 'euc-jp'],
  ['csiso2022jp', 'iso-2022-jp'],
  ['iso-2022-jp', 'iso-2022-jp'],
  ['csshiftjis', 'shift_jis'],
  ['ms932', 'shift_jis'],
  ['ms_kanji', 'shift_jis'],
  ['shift-jis', 'shift_jis'],
  ['shift_jis', 'shift_jis'],
  ['sjis', 'shift_jis'],
  ['windows-31j', 'shift_jis'],
  ['x-sjis', 'shift_jis'],
  ['cseuckr', 'euc-kr'],
  ['csksc56011987', 'euc-kr'],
  ['euc-kr', 'euc-kr'],
  ['iso-ir-149', 'euc-kr'],
  ['korean', 'euc-kr'],
  ['ks_c_5601-1987', 'euc-kr'],
  ['ks_c_5601-1989', 'euc-kr'],
  ['ksc5601', 'euc-kr'],
  ['ksc_5601', 'euc-kr'],
  ['windows-949', 'euc-kr'],
  ['utf-16be', 'utf-16be'],
  ['utf-16le', 'utf-16le'],
  ['utf-16', 'utf-16le'],
]);

// Unfortunately, String.prototype.trim also removes non-ascii whitespace,
// so we have to do this manually
/**
 * Strips leading and trailing TAB, LF, FF, CR and SPACE from `label`.
 * @param {string} label
 * @returns {string} The trimmed slice of `label`.
 */
function trimAsciiWhitespace(label) {
  let s = 0;
  let e = label.length;
  while (s < e && (
    label[s] === '\u0009' ||
    label[s] === '\u000a' ||
    label[s] === '\u000c' ||
    label[s] === '\u000d' ||
    label[s] === '\u0020')) {
    s++;
  }
  while (e > s && (
    label[e - 1] === '\u0009' ||
    label[e - 1] === '\u000a' ||
    label[e - 1] === '\u000c' ||
    label[e - 1] === '\u000d' ||
    label[e - 1] === '\u0020')) {
    e--;
  }
  return StringPrototypeSlice(label, s, e);
}

// Likewise, String.prototype.toLowerCase applies Unicode case mapping (for
// example U+212A KELVIN SIGN becomes 'k'), whereas labels must only be matched
// ASCII case-insensitively, so only A-Z are folded here.
/**
 * Lower-cases the ASCII letters A-Z in `str`, leaving every other code unit
 * untouched.
 * @param {string} str
 * @returns {string} `str` itself when it contains no ASCII upper-case letter.
 */
function asciiLowerCase(str) {
  let lowered = '';
  let copiedUpTo = 0;
  for (let i = 0; i < str.length; i++) {
    const code = StringPrototypeCharCodeAt(str, i);
    if (code >= 0x41 && code <= 0x5a) {
      // Flush the untouched run before this letter, then the folded letter.
      lowered += StringPrototypeSlice(str, copiedUpTo, i) +
                 StringFromCharCode(code + 0x20);
      copiedUpTo = i + 1;
    }
  }
  if (copiedUpTo === 0)
    return str;
  return lowered + StringPrototypeSlice(str, copiedUpTo);
}

/**
 * Resolves an encoding label to its canonical encoding name.
 * @param {string} label
 * @returns {string | undefined} The canonical name from `encodings`, or
 *   `undefined` when the label is not recognised.
 */
function getEncodingFromLabel(label) {
  // Fast path: the label is already in canonical (lower-case, trimmed) form.
  const enc = encodings.get(label);
  if (enc !== undefined) return enc;
  return encodings.get(trimAsciiWhitespace(asciiLowerCase(label)));
}

// Scratch array handed to the native encodeInto(); slot 0 is reported as
// `read` and slot 1 as `written` by TextEncoder.prototype.encodeInto().
const encodeIntoResults = new Uint32Array(2);

/**
 * WHATWG TextEncoder. Always reports 'utf-8' as its encoding.
 */
class TextEncoder {
  constructor() {
    this[kEncoder] = true;
  }

  /**
   * @returns {string} Always 'utf-8'.
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   */
  get encoding() {
    validateEncoder(this);
    return 'utf-8';
  }

  /**
   * Encodes `input` (coerced to a string) via the native encodeUtf8String().
   * @param {any} [input] Defaults to the empty string.
   * @returns The value returned by encodeUtf8String().
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   */
  encode(input = '') {
    validateEncoder(this);
    return encodeUtf8String(`${input}`);
  }

  /**
   * Encodes `src` into the caller-provided `dest`.
   * @param {string} src
   * @param {Uint8Array} dest
   * @returns {{ read: number, written: number }} The two counters that the
   *   native encodeInto() stored in `encodeIntoResults`.
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   * @throws {ERR_INVALID_ARG_TYPE} If `src` is not a string or `dest` is not
   *   a Uint8Array.
   */
  encodeInto(src, dest) {
    validateEncoder(this);
    validateString(src, 'src');
    if (!dest || !isUint8Array(dest))
      throw new ERR_INVALID_ARG_TYPE('dest', 'Uint8Array', dest);
    encodeInto(src, dest, encodeIntoResults);
    return { read: encodeIntoResults[0], written: encodeIntoResults[1] };
  }

  /**
   * Custom inspection hook: renders a stand-in object that exposes only
   * `encoding`.
   * @param {number} [depth] A negative number short-circuits and returns
   *   `this`.
   * @param {object} [opts] Forwarded to inspect().
   */
  [inspect](depth, opts) {
    validateEncoder(this);
    if (typeof depth === 'number' && depth < 0)
      return this;
    const ctor = getConstructorOf(this);
    // The stand-in's prototype carries `constructor` so that the printed
    // name follows the receiver's constructor (or TextEncoder as fallback).
    const obj = ObjectCreate({
      constructor: ctor === null ? TextEncoder : ctor
    });
    obj.encoding = this.encoding;
    // Lazy to avoid circular dependency
    return require('internal/util/inspect').inspect(obj, opts);
  }
}

ObjectDefineProperties(
  TextEncoder.prototype, {
    'encode': { enumerable: true },
    'encodeInto': { enumerable: true },
    'encoding': { enumerable: true },
    [SymbolToStringTag]: { configurable: true, value: 'TextEncoder' },
  });

// Pick the decoder implementation once, depending on whether this build
// reports Intl support.
const TextDecoder =
  internalBinding('config').hasIntl ?
    makeTextDecoderICU() :
    makeTextDecoderJS();

/**
 * Builds the TextDecoder class backed by the 'icu' internal binding.
 * @returns {typeof TextDecoder}
 */
function makeTextDecoderICU() {
  const {
    decode: _decode,
    getConverter,
  } = internalBinding('icu');

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] Reads `fatal` and `ignoreBOM`.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or no
     *   converter could be obtained for it.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      if (options !== null) {
        flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0;
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      // An undefined handle is treated as "encoding not supported".
      const handle = getConverter(enc, flags);
      if (handle === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      this[kDecoder] = true;
      this[kHandle] = handle;
      this[kFlags] = flags;
      this[kEncoding] = enc;
    }


    /**
     * @param {ArrayBuffer | ArrayBufferView} [input] Defaults to an empty
     *   Uint8Array.
     * @param {object | null} [options] Reads `stream`; when it is falsy the
     *   FLUSH flag is passed to the converter.
     * @returns {string}
     * @throws {ERR_INVALID_THIS} If the receiver is not a TextDecoder.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` has an unsupported type.
     * @throws {ERR_ENCODING_INVALID_ENCODED_DATA} If the binding returns a
     *   number instead of a result.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);
      if (isAnyArrayBuffer(input)) {
        input = lazyBuffer().from(input);
      } else if (!isArrayBufferView(input)) {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // Note: a `null` options object leaves FLUSH unset.
      let flags = 0;
      if (options !== null)
        flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH;

      const ret = _decode(this[kHandle], input, flags);
      // A numeric return value signals a decoding failure.
      if (typeof ret === 'number') {
        throw new ERR_ENCODING_INVALID_ENCODED_DATA(this.encoding, ret);
      }
      return ret.toString('ucs2');
    }
  }

  return TextDecoder;
}

/**
 * Builds the fallback TextDecoder class backed by `string_decoder`. It only
 * accepts 'utf-8' and 'utf-16le' and rejects the `fatal` option.
 * @returns {typeof TextDecoder}
 */
function makeTextDecoderJS() {
  let StringDecoder;
  function lazyStringDecoder() {
    if (StringDecoder === undefined)
      ({ StringDecoder } = require('string_decoder'));
    return StringDecoder;
  }

  // Per-instance marker: has the start of the current stream been examined
  // for a BOM yet?
  const kBOMSeen = Symbol('BOM seen');

  function hasConverter(encoding) {
    return encoding === 'utf-8' || encoding === 'utf-16le';
  }

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] Reads `fatal` and `ignoreBOM`.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or is not
     *   one of the encodings accepted by hasConverter().
     * @throws {ERR_NO_ICU} If `options.fatal` is truthy.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined || !hasConverter(enc))
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      if (options !== null) {
        if (options.fatal) {
          throw new ERR_NO_ICU('"fatal" option');
        }
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      this[kDecoder] = true;
      // StringDecoder will normalize WHATWG encoding to Node.js encoding.
      this[kHandle] = new (lazyStringDecoder())(enc);
      this[kFlags] = flags;
      this[kEncoding] = enc;
      this[kBOMSeen] = false;
    }

    /**
     * @param {ArrayBuffer | ArrayBufferView} [input] Defaults to an empty
     *   Uint8Array.
     * @param {object | null} [options] Reads `stream`.
     * @returns {string}
     * @throws {ERR_INVALID_THIS} If the receiver is not a TextDecoder.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` has an unsupported type.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);
      if (isAnyArrayBuffer(input)) {
        input = lazyBuffer().from(input);
      } else if (isArrayBufferView(input)) {
        input = lazyBuffer().from(input.buffer, input.byteOffset,
                                  input.byteLength);
      } else {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // FLUSH still set from the previous call means that call ended a
      // stream, so this call starts a new one and BOM detection is re-armed.
      if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
        this[kBOMSeen] = false;
      }

      // Record on the instance whether this call is streaming or final.
      if (options !== null && options.stream) {
        this[kFlags] &= ~CONVERTER_FLAGS_FLUSH;
      } else {
        this[kFlags] |= CONVERTER_FLAGS_FLUSH;
      }

      let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ?
        this[kHandle].end(input) :
        this[kHandle].write(input);

      if (result.length > 0 &&
          !this[kBOMSeen] &&
          !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
        // If the very first result in the stream is a BOM, and we are not
        // explicitly told to ignore it, then we discard it.
        if (result[0] === '\ufeff') {
          result = StringPrototypeSlice(result, 1);
        }
        this[kBOMSeen] = true;
      }

      return result;
    }
  }

  return TextDecoder;
}

// Mix in some shared properties.
532ObjectDefineProperties( 533 TextDecoder.prototype, 534 ObjectGetOwnPropertyDescriptors({ 535 get encoding() { 536 validateDecoder(this); 537 return this[kEncoding]; 538 }, 539 540 get fatal() { 541 validateDecoder(this); 542 return (this[kFlags] & CONVERTER_FLAGS_FATAL) === CONVERTER_FLAGS_FATAL; 543 }, 544 545 get ignoreBOM() { 546 validateDecoder(this); 547 return (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) === 548 CONVERTER_FLAGS_IGNORE_BOM; 549 }, 550 551 [inspect](depth, opts) { 552 validateDecoder(this); 553 if (typeof depth === 'number' && depth < 0) 554 return this; 555 const constructor = getConstructorOf(this) || TextDecoder; 556 const obj = ObjectCreate({ constructor }); 557 obj.encoding = this.encoding; 558 obj.fatal = this.fatal; 559 obj.ignoreBOM = this.ignoreBOM; 560 if (opts.showHidden) { 561 obj[kFlags] = this[kFlags]; 562 obj[kHandle] = this[kHandle]; 563 } 564 // Lazy to avoid circular dependency 565 const { inspect } = require('internal/util/inspect'); 566 return `${constructor.name} ${inspect(obj)}`; 567 } 568 }) 569); 570 571ObjectDefineProperties(TextDecoder.prototype, { 572 decode: { enumerable: true }, 573 [inspect]: { enumerable: false }, 574 [SymbolToStringTag]: { 575 configurable: true, 576 value: 'TextDecoder' 577 } 578}); 579 580module.exports = { 581 getEncodingFromLabel, 582 TextDecoder, 583 TextEncoder 584}; 585