'use strict';

// An implementation of the WHATWG Encoding Standard
// https://encoding.spec.whatwg.org

const {
  Map,
  ObjectCreate,
  ObjectDefineProperties,
  ObjectGetOwnPropertyDescriptors,
  Symbol,
  SymbolToStringTag,
  Uint32Array,
  Uint8Array,
} = primordials;

const {
  ERR_ENCODING_INVALID_ENCODED_DATA,
  ERR_ENCODING_NOT_SUPPORTED,
  ERR_INVALID_ARG_TYPE,
  ERR_INVALID_THIS,
  ERR_NO_ICU
} = require('internal/errors').codes;

// Private per-instance slots. kEncoder/kDecoder double as brand checks for
// validateEncoder()/validateDecoder().
const kHandle = Symbol('handle');
const kFlags = Symbol('flags');
const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');

const {
  getConstructorOf,
  customInspectSymbol: inspect
} = require('internal/util');

const {
  isAnyArrayBuffer,
  isArrayBufferView,
  isUint8Array
} = require('internal/util/types');

const { validateString } = require('internal/validators');

const {
  encodeInto,
  encodeUtf8String
} = internalBinding('buffer');

let Buffer;

/**
 * Returns the `Buffer` class from the public `buffer` module, loading the
 * module on the first call and caching the result afterwards.
 * @returns {Function} The `Buffer` constructor.
 */
function lazyBuffer() {
  if (Buffer === undefined)
    Buffer = require('buffer').Buffer;
  return Buffer;
}

/**
 * Brand check for TextEncoder methods.
 * @param {any} obj The receiver to check.
 * @throws {ERR_INVALID_THIS} If `obj` is nullish or `obj[kEncoder]` is not
 *   exactly `true`.
 */
function validateEncoder(obj) {
  if (obj == null || obj[kEncoder] !== true)
    throw new ERR_INVALID_THIS('TextEncoder');
}

/**
 * Brand check for TextDecoder methods.
 * @param {any} obj The receiver to check.
 * @throws {ERR_INVALID_THIS} If `obj` is nullish or `obj[kDecoder]` is not
 *   exactly `true`.
 */
function validateDecoder(obj) {
  if (obj == null || obj[kDecoder] !== true)
    throw new ERR_INVALID_THIS('TextDecoder');
}

/**
 * Asserts that `typeof prop` equals `expected`.
 * @param {any} prop The value to check.
 * @param {string} expected The required `typeof` result.
 * @param {string} propName Argument name reported in the error.
 * @param {string} expectedName Type name reported in the error.
 * @throws {ERR_INVALID_ARG_TYPE} If the `typeof` check fails. Note that
 *   `null` passes a check for `'object'`.
 */
function validateArgument(prop, expected, propName, expectedName) {
  if (typeof prop !== expected)
    throw new ERR_INVALID_ARG_TYPE(propName, expectedName, prop);
}

// Bit flags kept in this[kFlags] and/or handed to the converter.
const CONVERTER_FLAGS_FLUSH = 0x1;
const CONVERTER_FLAGS_FATAL = 0x2;
const CONVERTER_FLAGS_IGNORE_BOM = 0x4;

// Default `input` for decode() when the caller passes nothing.
const empty = new Uint8Array(0);

// Maps every recognised (lower-case) label to the name of its encoding.
const encodings = new Map([
  ['unicode-1-1-utf-8', 'utf-8'],
  ['utf8', 'utf-8'],
  ['utf-8', 'utf-8'],
  ['866', 'ibm866'],
  ['cp866', 'ibm866'],
  ['csibm866', 'ibm866'],
  ['ibm866', 'ibm866'],
  ['csisolatin2', 'iso-8859-2'],
  ['iso-8859-2', 'iso-8859-2'],
  ['iso-ir-101', 'iso-8859-2'],
  ['iso8859-2', 'iso-8859-2'],
  ['iso88592', 'iso-8859-2'],
  ['iso_8859-2', 'iso-8859-2'],
  ['iso_8859-2:1987', 'iso-8859-2'],
  ['l2', 'iso-8859-2'],
  ['latin2', 'iso-8859-2'],
  ['csisolatin3', 'iso-8859-3'],
  ['iso-8859-3', 'iso-8859-3'],
  ['iso-ir-109', 'iso-8859-3'],
  ['iso8859-3', 'iso-8859-3'],
  ['iso88593', 'iso-8859-3'],
  ['iso_8859-3', 'iso-8859-3'],
  ['iso_8859-3:1988', 'iso-8859-3'],
  ['l3', 'iso-8859-3'],
  ['latin3', 'iso-8859-3'],
  ['csisolatin4', 'iso-8859-4'],
  ['iso-8859-4', 'iso-8859-4'],
  ['iso-ir-110', 'iso-8859-4'],
  ['iso8859-4', 'iso-8859-4'],
  ['iso88594', 'iso-8859-4'],
  ['iso_8859-4', 'iso-8859-4'],
  ['iso_8859-4:1988', 'iso-8859-4'],
  ['l4', 'iso-8859-4'],
  ['latin4', 'iso-8859-4'],
  ['csisolatincyrillic', 'iso-8859-5'],
  ['cyrillic', 'iso-8859-5'],
  ['iso-8859-5', 'iso-8859-5'],
  ['iso-ir-144', 'iso-8859-5'],
  ['iso8859-5', 'iso-8859-5'],
  ['iso88595', 'iso-8859-5'],
  ['iso_8859-5', 'iso-8859-5'],
  ['iso_8859-5:1988', 'iso-8859-5'],
  ['arabic', 'iso-8859-6'],
  ['asmo-708', 'iso-8859-6'],
  ['csiso88596e', 'iso-8859-6'],
  ['csiso88596i', 'iso-8859-6'],
  ['csisolatinarabic', 'iso-8859-6'],
  ['ecma-114', 'iso-8859-6'],
  ['iso-8859-6', 'iso-8859-6'],
  ['iso-8859-6-e', 'iso-8859-6'],
  ['iso-8859-6-i', 'iso-8859-6'],
  ['iso-ir-127', 'iso-8859-6'],
  ['iso8859-6', 'iso-8859-6'],
  ['iso88596', 'iso-8859-6'],
  ['iso_8859-6', 'iso-8859-6'],
  ['iso_8859-6:1987', 'iso-8859-6'],
  ['csisolatingreek', 'iso-8859-7'],
  ['ecma-118', 'iso-8859-7'],
  ['elot_928', 'iso-8859-7'],
  ['greek', 'iso-8859-7'],
  ['greek8', 'iso-8859-7'],
  ['iso-8859-7', 'iso-8859-7'],
  ['iso-ir-126', 'iso-8859-7'],
  ['iso8859-7', 'iso-8859-7'],
  ['iso88597', 'iso-8859-7'],
  ['iso_8859-7', 'iso-8859-7'],
  ['iso_8859-7:1987', 'iso-8859-7'],
  ['sun_eu_greek', 'iso-8859-7'],
  ['csiso88598e', 'iso-8859-8'],
  ['csisolatinhebrew', 'iso-8859-8'],
  ['hebrew', 'iso-8859-8'],
  ['iso-8859-8', 'iso-8859-8'],
  ['iso-8859-8-e', 'iso-8859-8'],
  ['iso-ir-138', 'iso-8859-8'],
  ['iso8859-8', 'iso-8859-8'],
  ['iso88598', 'iso-8859-8'],
  ['iso_8859-8', 'iso-8859-8'],
  ['iso_8859-8:1988', 'iso-8859-8'],
  ['visual', 'iso-8859-8'],
  ['csiso88598i', 'iso-8859-8-i'],
  ['iso-8859-8-i', 'iso-8859-8-i'],
  ['logical', 'iso-8859-8-i'],
  ['csisolatin6', 'iso-8859-10'],
  ['iso-8859-10', 'iso-8859-10'],
  ['iso-ir-157', 'iso-8859-10'],
  ['iso8859-10', 'iso-8859-10'],
  ['iso885910', 'iso-8859-10'],
  ['l6', 'iso-8859-10'],
  ['latin6', 'iso-8859-10'],
  ['iso-8859-13', 'iso-8859-13'],
  ['iso8859-13', 'iso-8859-13'],
  ['iso885913', 'iso-8859-13'],
  ['iso-8859-14', 'iso-8859-14'],
  ['iso8859-14', 'iso-8859-14'],
  ['iso885914', 'iso-8859-14'],
  ['csisolatin9', 'iso-8859-15'],
  ['iso-8859-15', 'iso-8859-15'],
  ['iso8859-15', 'iso-8859-15'],
  ['iso885915', 'iso-8859-15'],
  ['iso_8859-15', 'iso-8859-15'],
  ['l9', 'iso-8859-15'],
  ['cskoi8r', 'koi8-r'],
  ['koi', 'koi8-r'],
  ['koi8', 'koi8-r'],
  ['koi8-r', 'koi8-r'],
  ['koi8_r', 'koi8-r'],
  ['koi8-ru', 'koi8-u'],
  ['koi8-u', 'koi8-u'],
  ['csmacintosh', 'macintosh'],
  ['mac', 'macintosh'],
  ['macintosh', 'macintosh'],
  ['x-mac-roman', 'macintosh'],
  ['dos-874', 'windows-874'],
  ['iso-8859-11', 'windows-874'],
  ['iso8859-11', 'windows-874'],
  ['iso885911', 'windows-874'],
  ['tis-620', 'windows-874'],
  ['windows-874', 'windows-874'],
  ['cp1250', 'windows-1250'],
  ['windows-1250', 'windows-1250'],
  ['x-cp1250', 'windows-1250'],
  ['cp1251', 'windows-1251'],
  ['windows-1251', 'windows-1251'],
  ['x-cp1251', 'windows-1251'],
  ['ansi_x3.4-1968', 'windows-1252'],
  ['ascii', 'windows-1252'],
  ['cp1252', 'windows-1252'],
  ['cp819', 'windows-1252'],
  ['csisolatin1', 'windows-1252'],
  ['ibm819', 'windows-1252'],
  ['iso-8859-1', 'windows-1252'],
  ['iso-ir-100', 'windows-1252'],
  ['iso8859-1', 'windows-1252'],
  ['iso88591', 'windows-1252'],
  ['iso_8859-1', 'windows-1252'],
  ['iso_8859-1:1987', 'windows-1252'],
  ['l1', 'windows-1252'],
  ['latin1', 'windows-1252'],
  ['us-ascii', 'windows-1252'],
  ['windows-1252', 'windows-1252'],
  ['x-cp1252', 'windows-1252'],
  ['cp1253', 'windows-1253'],
  ['windows-1253', 'windows-1253'],
  ['x-cp1253', 'windows-1253'],
  ['cp1254', 'windows-1254'],
  ['csisolatin5', 'windows-1254'],
  ['iso-8859-9', 'windows-1254'],
  ['iso-ir-148', 'windows-1254'],
  ['iso8859-9', 'windows-1254'],
  ['iso88599', 'windows-1254'],
  ['iso_8859-9', 'windows-1254'],
  ['iso_8859-9:1989', 'windows-1254'],
  ['l5', 'windows-1254'],
  ['latin5', 'windows-1254'],
  ['windows-1254', 'windows-1254'],
  ['x-cp1254', 'windows-1254'],
  ['cp1255', 'windows-1255'],
  ['windows-1255', 'windows-1255'],
  ['x-cp1255', 'windows-1255'],
  ['cp1256', 'windows-1256'],
  ['windows-1256', 'windows-1256'],
  ['x-cp1256', 'windows-1256'],
  ['cp1257', 'windows-1257'],
  ['windows-1257', 'windows-1257'],
  ['x-cp1257', 'windows-1257'],
  ['cp1258', 'windows-1258'],
  ['windows-1258', 'windows-1258'],
  ['x-cp1258', 'windows-1258'],
  ['x-mac-cyrillic', 'x-mac-cyrillic'],
  ['x-mac-ukrainian', 'x-mac-cyrillic'],
  ['chinese', 'gbk'],
  ['csgb2312', 'gbk'],
  ['csiso58gb231280', 'gbk'],
  ['gb2312', 'gbk'],
  ['gb_2312', 'gbk'],
  ['gb_2312-80', 'gbk'],
  ['gbk', 'gbk'],
  ['iso-ir-58', 'gbk'],
  ['x-gbk', 'gbk'],
  ['gb18030', 'gb18030'],
  ['big5', 'big5'],
  ['big5-hkscs', 'big5'],
  ['cn-big5', 'big5'],
  ['csbig5', 'big5'],
  ['x-x-big5', 'big5'],
  ['cseucpkdfmtjapanese', 'euc-jp'],
  ['euc-jp', 'euc-jp'],
  ['x-euc-jp', 'euc-jp'],
  ['csiso2022jp', 'iso-2022-jp'],
  ['iso-2022-jp', 'iso-2022-jp'],
  ['csshiftjis', 'shift_jis'],
  ['ms932', 'shift_jis'],
  ['ms_kanji', 'shift_jis'],
  ['shift-jis', 'shift_jis'],
  ['shift_jis', 'shift_jis'],
  ['sjis', 'shift_jis'],
  ['windows-31j', 'shift_jis'],
  ['x-sjis', 'shift_jis'],
  ['cseuckr', 'euc-kr'],
  ['csksc56011987', 'euc-kr'],
  ['euc-kr', 'euc-kr'],
  ['iso-ir-149', 'euc-kr'],
  ['korean', 'euc-kr'],
  ['ks_c_5601-1987', 'euc-kr'],
  ['ks_c_5601-1989', 'euc-kr'],
  ['ksc5601', 'euc-kr'],
  ['ksc_5601', 'euc-kr'],
  ['windows-949', 'euc-kr'],
  ['utf-16be', 'utf-16be'],
  ['utf-16le', 'utf-16le'],
  ['utf-16', 'utf-16le']
]);

// Unfortunately, String.prototype.trim also removes non-ascii whitespace,
// so we have to do this manually
/**
 * Strips leading and trailing TAB, LF, FF, CR and SPACE characters.
 * @param {string} label
 * @returns {string} `label` without surrounding ASCII whitespace.
 */
function trimAsciiWhitespace(label) {
  let s = 0;
  let e = label.length;
  while (s < e && (
    label[s] === '\u0009' ||
    label[s] === '\u000a' ||
    label[s] === '\u000c' ||
    label[s] === '\u000d' ||
    label[s] === '\u0020')) {
    s++;
  }
  while (e > s && (
    label[e - 1] === '\u0009' ||
    label[e - 1] === '\u000a' ||
    label[e - 1] === '\u000c' ||
    label[e - 1] === '\u000d' ||
    label[e - 1] === '\u0020')) {
    e--;
  }
  return label.slice(s, e);
}

/**
 * Lower-cases the ASCII letters A-Z only, leaving every other code unit as
 * is.
 *
 * Labels must be matched ASCII case-insensitively. Calling
 * String.prototype.toLowerCase() on the whole label applies Unicode case
 * mappings, which turn U+212A KELVIN SIGN into an ASCII 'k' and would make
 * e.g. '\u212Aoi8-r' resolve to koi8-r even though it is not a valid label.
 * @param {string} label
 * @returns {string}
 */
function asciiLowerCase(label) {
  // Each matched run contains only A-Z, so toLowerCase() cannot introduce
  // any non-ASCII mapping here.
  return label.replace(/[A-Z]+/g, (run) => run.toLowerCase());
}

/**
 * Resolves an encoding label to the name of its encoding.
 * @param {string} label The label, in any ASCII letter case and optionally
 *   surrounded by ASCII whitespace.
 * @returns {string | undefined} The encoding name from the `encodings`
 *   table, or `undefined` if the label is not recognised.
 */
function getEncodingFromLabel(label) {
  // Fast path: the label is already in its normalised form.
  const enc = encodings.get(label);
  if (enc !== undefined) return enc;
  return encodings.get(trimAsciiWhitespace(asciiLowerCase(label)));
}

// Scratch buffer the native encodeInto() writes its two results into.
const encodeIntoResults = new Uint32Array(2);

/**
 * WHATWG TextEncoder. Always reports and produces UTF-8.
 */
class TextEncoder {
  constructor() {
    this[kEncoder] = true;
  }

  /**
   * @returns {string} Always `'utf-8'`.
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   */
  get encoding() {
    validateEncoder(this);
    return 'utf-8';
  }

  /**
   * Encodes `input`, coerced to a string, via the native encodeUtf8String().
   * @param {any} [input] Value to encode; defaults to the empty string.
   * @returns {Uint8Array} Whatever encodeUtf8String() returns — presumably a
   *   Uint8Array of UTF-8 bytes; the binding is not visible from this file.
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   */
  encode(input = '') {
    validateEncoder(this);
    return encodeUtf8String(`${input}`);
  }

  /**
   * Encodes `src` into the caller-supplied `dest` via the native
   * encodeInto().
   * @param {string} src
   * @param {Uint8Array} dest
   * @returns {{ read: number, written: number }} The two values the binding
   *   stored in `encodeIntoResults`.
   * @throws {ERR_INVALID_THIS} If the receiver is not a TextEncoder.
   * @throws {ERR_INVALID_ARG_TYPE} If `src` is not a string or `dest` is not
   *   a Uint8Array.
   */
  encodeInto(src, dest) {
    validateEncoder(this);
    validateString(src, 'src');
    if (!dest || !isUint8Array(dest))
      throw new ERR_INVALID_ARG_TYPE('dest', 'Uint8Array', dest);
    encodeInto(src, dest, encodeIntoResults);
    return { read: encodeIntoResults[0], written: encodeIntoResults[1] };
  }

  // Custom util.inspect() hook: renders a plain object that carries the
  // receiver's constructor and its `encoding`.
  [inspect](depth, opts) {
    validateEncoder(this);
    if (typeof depth === 'number' && depth < 0)
      return this;
    const ctor = getConstructorOf(this);
    const obj = ObjectCreate({
      constructor: ctor === null ? TextEncoder : ctor
    });
    obj.encoding = this.encoding;
    // Lazy to avoid circular dependency
    return require('internal/util/inspect').inspect(obj, opts);
  }
}

// Class members are non-enumerable by default; make the public ones
// enumerable and add the toStringTag.
ObjectDefineProperties(
  TextEncoder.prototype, {
    'encode': { enumerable: true },
    'encodeInto': { enumerable: true },
    'encoding': { enumerable: true },
    [SymbolToStringTag]: {
      configurable: true,
      value: 'TextEncoder'
    } });

// Pick the ICU-backed decoder when Intl support is compiled in, otherwise the
// StringDecoder-based fallback. (Both factories are hoisted declarations.)
const TextDecoder =
  internalBinding('config').hasIntl ?
    makeTextDecoderICU() :
    makeTextDecoderJS();

/**
 * Builds the TextDecoder class backed by the native `icu` binding.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderICU() {
  const {
    decode: _decode,
    getConverter,
  } = internalBinding('icu');

  class TextDecoder {
    /**
     * @param {any} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] May carry `fatal` and `ignoreBOM`.
     * @throws {ERR_INVALID_ARG_TYPE} If `options` is not an object.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or the
     *   binding returns no converter for it.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateArgument(options, 'object', 'options', 'Object');

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      // `null` passes the typeof check above, so guard before reading.
      if (options !== null) {
        flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0;
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      const handle = getConverter(enc, flags);
      if (handle === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      this[kDecoder] = true;
      this[kHandle] = handle;
      this[kFlags] = flags;
      this[kEncoding] = enc;
    }

    /**
     * Decodes `input` with the converter created in the constructor.
     * @param {ArrayBuffer | ArrayBufferView} [input] Defaults to an empty
     *   Uint8Array.
     * @param {object | null} [options] `options.stream` truthy suppresses
     *   the flush flag for this call.
     * @returns {string}
     * @throws {ERR_INVALID_THIS} If the receiver is not a TextDecoder.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` or `options` has the wrong
     *   type.
     * @throws {ERR_ENCODING_INVALID_ENCODED_DATA} If the binding reports a
     *   failure by returning a number.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);
      if (isAnyArrayBuffer(input)) {
        input = lazyBuffer().from(input);
      } else if (!isArrayBufferView(input)) {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateArgument(options, 'object', 'options', 'Object');

      let flags = 0;
      if (options !== null)
        flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH;

      const ret = _decode(this[kHandle], input, flags);
      // A numeric return value is the binding's failure signal.
      if (typeof ret === 'number') {
        throw new ERR_ENCODING_INVALID_ENCODED_DATA(this.encoding, ret);
      }
      return ret.toString('ucs2');
    }
  }

  return TextDecoder;
}

/**
 * Builds the fallback TextDecoder class that uses `string_decoder` and only
 * supports utf-8 and utf-16le, without the `fatal` option.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderJS() {
  let StringDecoder;
  // Loads `string_decoder` on first use and caches the class.
  function lazyStringDecoder() {
    if (StringDecoder === undefined)
      ({ StringDecoder } = require('string_decoder'));
    return StringDecoder;
  }

  // Whether BOM handling has already run for the current stream.
  const kBOMSeen = Symbol('BOM seen');

  function hasConverter(encoding) {
    return encoding === 'utf-8' || encoding === 'utf-16le';
  }

  class TextDecoder {
    /**
     * @param {any} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] May carry `ignoreBOM`; `fatal` is
     *   rejected.
     * @throws {ERR_INVALID_ARG_TYPE} If `options` is not an object.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} If the label is unknown or maps
     *   to an encoding other than utf-8/utf-16le.
     * @throws {ERR_NO_ICU} If `options.fatal` is truthy.
     */
    constructor(encoding = 'utf-8', options = {}) {
      encoding = `${encoding}`;
      validateArgument(options, 'object', 'options', 'Object');

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined || !hasConverter(enc))
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      if (options !== null) {
        if (options.fatal) {
          throw new ERR_NO_ICU('"fatal" option');
        }
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      this[kDecoder] = true;
      // StringDecoder will normalize WHATWG encoding to Node.js encoding.
      this[kHandle] = new (lazyStringDecoder())(enc);
      this[kFlags] = flags;
      this[kEncoding] = enc;
      this[kBOMSeen] = false;
    }

    /**
     * Decodes `input` through the StringDecoder created in the constructor.
     * @param {ArrayBuffer | ArrayBufferView} [input] Defaults to an empty
     *   Uint8Array.
     * @param {object | null} [options] `options.stream` truthy keeps the
     *   decoder open (write()); otherwise it is finished (end()).
     * @returns {string}
     * @throws {ERR_INVALID_THIS} If the receiver is not a TextDecoder.
     * @throws {ERR_INVALID_ARG_TYPE} If `input` or `options` has the wrong
     *   type.
     */
    decode(input = empty, options = {}) {
      validateDecoder(this);
      if (isAnyArrayBuffer(input)) {
        input = lazyBuffer().from(input);
      } else if (isArrayBufferView(input)) {
        input = lazyBuffer().from(input.buffer, input.byteOffset,
                                  input.byteLength);
      } else {
        throw new ERR_INVALID_ARG_TYPE('input',
                                       ['ArrayBuffer', 'ArrayBufferView'],
                                       input);
      }
      validateArgument(options, 'object', 'options', 'Object');

      // The FLUSH bit still holds the previous call's mode here: if that
      // call flushed, this one starts a new stream and may see a BOM again.
      if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
        this[kBOMSeen] = false;
      }

      if (options !== null && options.stream) {
        this[kFlags] &= ~CONVERTER_FLAGS_FLUSH;
      } else {
        this[kFlags] |= CONVERTER_FLAGS_FLUSH;
      }

      let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ?
        this[kHandle].end(input) :
        this[kHandle].write(input);

      if (result.length > 0 &&
          !this[kBOMSeen] &&
          !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
        // If the very first result in the stream is a BOM, and we are not
        // explicitly told to ignore it, then we discard it.
        if (result[0] === '\ufeff') {
          result = result.slice(1);
        }
        this[kBOMSeen] = true;
      }

      return result;
    }
  }

  return TextDecoder;
}

// Mix in some shared properties.
519{ 520 ObjectDefineProperties( 521 TextDecoder.prototype, 522 ObjectGetOwnPropertyDescriptors({ 523 get encoding() { 524 validateDecoder(this); 525 return this[kEncoding]; 526 }, 527 528 get fatal() { 529 validateDecoder(this); 530 return (this[kFlags] & CONVERTER_FLAGS_FATAL) === CONVERTER_FLAGS_FATAL; 531 }, 532 533 get ignoreBOM() { 534 validateDecoder(this); 535 return (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) === 536 CONVERTER_FLAGS_IGNORE_BOM; 537 }, 538 539 [inspect](depth, opts) { 540 validateDecoder(this); 541 if (typeof depth === 'number' && depth < 0) 542 return this; 543 const ctor = getConstructorOf(this); 544 const obj = ObjectCreate({ 545 constructor: ctor === null ? TextDecoder : ctor 546 }); 547 obj.encoding = this.encoding; 548 obj.fatal = this.fatal; 549 obj.ignoreBOM = this.ignoreBOM; 550 if (opts.showHidden) { 551 obj[kFlags] = this[kFlags]; 552 obj[kHandle] = this[kHandle]; 553 } 554 // Lazy to avoid circular dependency 555 return require('internal/util/inspect').inspect(obj, opts); 556 } 557 })); 558 ObjectDefineProperties(TextDecoder.prototype, { 559 decode: { enumerable: true }, 560 [inspect]: { enumerable: false }, 561 [SymbolToStringTag]: { 562 configurable: true, 563 value: 'TextDecoder' 564 } 565 }); 566} 567 568module.exports = { 569 getEncodingFromLabel, 570 TextDecoder, 571 TextEncoder 572}; 573