// Copyright Joyent, Inc. and other Node contributors.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit
// persons to whom the Software is furnished to do so, subject to the
// following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
// USE OR OTHER DEALINGS IN THE SOFTWARE.
'use strict';

/*<replacement>*/

var Buffer = require('safe-buffer').Buffer;
/*</replacement>*/

// Encoding checker captured at load time: `Buffer.isEncoding` when it exists,
// otherwise a fallback that recognises a fixed list of encoding names
// (case-insensitively, after coercing the argument to a string).
var isEncoding = Buffer.isEncoding || function (encoding) {
  encoding = '' + encoding;
  switch (encoding && encoding.toLowerCase()) {
    case 'hex':
    case 'utf8':
    case 'utf-8':
    case 'ascii':
    case 'binary':
    case 'base64':
    case 'ucs2':
    case 'ucs-2':
    case 'utf16le':
    case 'utf-16le':
    case 'raw':
      return true;
    default:
      return false;
  }
};

// Maps an encoding name (or alias) onto its canonical name. A falsy `enc`
// defaults to 'utf8'. The name is first matched as given; if that fails it is
// stringified, lower-cased and matched once more. Returns undefined when the
// name is still not recognised.
function _normalizeEncoding(enc) {
  if (!enc) return 'utf8';
  var retried;
  while (true) {
    switch (enc) {
      case 'utf8':
      case 'utf-8':
        return 'utf8';
      case 'ucs2':
      case 'ucs-2':
      case 'utf16le':
      case 'utf-16le':
        return 'utf16le';
      case 'latin1':
      case 'binary':
        return 'latin1';
      case 'base64':
      case 'ascii':
      case 'hex':
        return enc;
      default:
        if (retried) return; // undefined
        enc = ('' + enc).toLowerCase();
        retried = true;
    }
  }
}

// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
//
// Returns the canonical name for `enc`, or `enc` itself when it is only known
// to a replaced `Buffer.isEncoding`. Throws Error('Unknown encoding: ...')
// when the name is recognised by neither.
function normalizeEncoding(enc) {
  var nenc = _normalizeEncoding(enc);
  if (typeof nenc !== 'string') {
    // Read the checker that is installed *now*; comparing it with the one
    // captured at load time tells us whether it has been replaced since.
    var live = Buffer.isEncoding;
    var supported;
    if (live === isEncoding) {
      // Not replaced: only the names handled by _normalizeEncoding are valid.
      supported = false;
    } else if (typeof live === 'function') {
      supported = live.call(Buffer, enc);
    } else {
      // No Buffer.isEncoding at all: use the load-time fallback.
      supported = isEncoding(enc);
    }
    if (!supported) throw new Error('Unknown encoding: ' + enc);
  }
  return nenc || enc;
}

// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// characters.
exports.StringDecoder = StringDecoder;

// Creates a decoder for `encoding` (normalised via normalizeEncoding, which
// throws on unknown names). utf16le, utf8 and base64 get per-instance
// overrides plus a small `lastChar` buffer for carrying a partial character
// between writes; every other encoding uses the stateless simpleWrite /
// simpleEnd pair and allocates no state.
function StringDecoder(encoding) {
  this.encoding = normalizeEncoding(encoding);
  var nb;
  switch (this.encoding) {
    case 'utf16le':
      this.text = utf16Text;
      this.end = utf16End;
      nb = 4;
      break;
    case 'utf8':
      this.fillLast = utf8FillLast;
      nb = 4;
      break;
    case 'base64':
      this.text = base64Text;
      this.end = base64End;
      nb = 3;
      break;
    default:
      this.write = simpleWrite;
      this.end = simpleEnd;
      return;
  }
  // Bytes still missing from the buffered partial character.
  this.lastNeed = 0;
  // Total byte length of the buffered partial character once complete.
  this.lastTotal = 0;
  this.lastChar = Buffer.allocUnsafe(nb);
}

// Decodes `buf` and returns the resulting string. If a partial character is
// pending from an earlier call it is completed first; when `buf` is too short
// to complete it, the bytes are buffered and '' is returned.
StringDecoder.prototype.write = function (buf) {
  if (buf.length === 0) return '';
  var r;
  var i;
  if (this.lastNeed) {
    r = this.fillLast(buf);
    if (r === undefined) return '';
    // Skip the bytes of `buf` that fillLast just consumed.
    i = this.lastNeed;
    this.lastNeed = 0;
  } else {
    i = 0;
  }
  if (i < buf.length) return r ? r + this.text(buf, i) : this.text(buf, i);
  return r || '';
};

StringDecoder.prototype.end = utf8End;

// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;

// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
// Returns the decoded character when `buf` holds enough bytes; otherwise
// buffers what is available, lowers `lastNeed` and returns undefined.
StringDecoder.prototype.fillLast = function (buf) {
  if (this.lastNeed <= buf.length) {
    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
  this.lastNeed -= buf.length;
};

// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
// continuation byte. If an invalid byte is detected, -2 is returned.
// Return values: 0 for ASCII, 2/3/4 for the leading byte of a sequence of
// that length, -1 for a continuation byte, -2 otherwise.
function utf8CheckByte(byte) {
  if (byte <= 0x7F) return 0;
  else if (byte >> 5 === 0x06) return 2;
  else if (byte >> 4 === 0x0E) return 3;
  else if (byte >> 3 === 0x1E) return 4;
  return byte >> 6 === 0x02 ? -1 : -2;
}

// Checks at most 3 bytes at the end of a Buffer in order to detect an
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
// needed to complete the UTF-8 character (if applicable) are returned.
// As a side effect, `self.lastNeed` is set to the number of bytes still
// missing. Bytes before index `i` are never inspected.
function utf8CheckIncomplete(self, buf, i) {
  var j = buf.length - 1;
  if (j < i) return 0;
  // Last byte.
  var nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) self.lastNeed = nb - 1;
    return nb;
  }
  // Second-to-last byte.
  if (--j < i || nb === -2) return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) self.lastNeed = nb - 2;
    return nb;
  }
  // Third-to-last byte.
  if (--j < i || nb === -2) return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) {
      // A 2-byte lead followed by two continuation bytes is not an
      // incomplete character; report nothing pending.
      if (nb === 2) nb = 0;
      else self.lastNeed = nb - 3;
    }
    return nb;
  }
  return 0;
}

// Validates as many continuation bytes for a multi-byte UTF-8 character as
// needed or are available. If we see a non-continuation byte where we expect
// one, we "replace" the validated continuation bytes we've seen so far with
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
// behavior. The continuation byte check is included three times in the case
// where all of the continuation bytes for a character exist in the same buffer.
// It is also done this way as a slight performance increase instead of using a
// loop.
// On failure `self.lastNeed` is set to the number of bytes of `buf` that were
// validated (and are therefore skipped by the caller); on success the function
// returns undefined. The `p` argument is accepted but not used.
function utf8CheckExtraBytes(self, buf, p) {
  if ((buf[0] & 0xC0) !== 0x80) {
    self.lastNeed = 0;
    return '\ufffd';
  }
  if (self.lastNeed > 1 && buf.length > 1) {
    if ((buf[1] & 0xC0) !== 0x80) {
      self.lastNeed = 1;
      return '\ufffd';
    }
    if (self.lastNeed > 2 && buf.length > 2) {
      if ((buf[2] & 0xC0) !== 0x80) {
        self.lastNeed = 2;
        return '\ufffd';
      }
    }
  }
}

// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
function utf8FillLast(buf) {
  // Offset in lastChar at which the next buffered byte belongs.
  var p = this.lastTotal - this.lastNeed;
  // A defined result means an invalid continuation byte was found and a
  // replacement character must be emitted instead.
  var r = utf8CheckExtraBytes(this, buf, p);
  if (r !== undefined) return r;
  if (this.lastNeed <= buf.length) {
    // Enough bytes to finish the character: copy them in and decode it.
    buf.copy(this.lastChar, p, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  // Still short: stash everything and keep waiting (returns undefined).
  buf.copy(this.lastChar, p, 0, buf.length);
  this.lastNeed -= buf.length;
}

// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.
function utf8Text(buf, i) {
  var total = utf8CheckIncomplete(this, buf, i);
  if (!this.lastNeed) return buf.toString('utf8', i);
  this.lastTotal = total;
  // Index of the first byte of the trailing partial character.
  var end = buf.length - (total - this.lastNeed);
  buf.copy(this.lastChar, 0, end);
  return buf.toString('utf8', i, end);
}

// For UTF-8, a replacement character is added when ending on a partial
// character.
// `buf` is optional; when given and non-empty it is written first. After the
// pending partial character has been flushed the buffered state is cleared so
// that the decoder can be reused without re-emitting it.
function utf8End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (!this.lastNeed) return r;
  this.lastNeed = 0;
  this.lastTotal = 0;
  return r + '\ufffd';
}

// UTF-16LE typically needs two bytes per character, but even if we have an even
// number of bytes available, we need to check if we end on a leading/high
// surrogate. In that case, we need to wait for the next two bytes in order to
// decode the last character properly.
function utf16Text(buf, i) {
  if ((buf.length - i) % 2 === 0) {
    var r = buf.toString('utf16le', i);
    if (r) {
      var c = r.charCodeAt(r.length - 1);
      if (c >= 0xD800 && c <= 0xDBFF) {
        // Ends on a high surrogate: hold its two bytes back and wait for the
        // two bytes of the low surrogate.
        this.lastNeed = 2;
        this.lastTotal = 4;
        this.lastChar[0] = buf[buf.length - 2];
        this.lastChar[1] = buf[buf.length - 1];
        return r.slice(0, -1);
      }
    }
    return r;
  }
  // Odd number of bytes: hold back the dangling byte of the last code unit.
  this.lastNeed = 1;
  this.lastTotal = 2;
  this.lastChar[0] = buf[buf.length - 1];
  return buf.toString('utf16le', i, buf.length - 1);
}

// For UTF-16LE we do not explicitly append special replacement characters if we
// end on a partial character, we simply let v8 handle that.
// `buf` is optional; when given and non-empty it is written first. The
// buffered state is cleared after flushing so the decoder can be reused.
function utf16End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (!this.lastNeed) return r;
  // Number of bytes currently held in lastChar.
  var end = this.lastTotal - this.lastNeed;
  this.lastNeed = 0;
  this.lastTotal = 0;
  return r + this.lastChar.toString('utf16le', 0, end);
}

// Returns the base64 text for the largest prefix of buf (starting at index i)
// whose length is a multiple of 3, and buffers the remaining 1 or 2 bytes so
// that no padding is emitted in the middle of the stream.
function base64Text(buf, i) {
  var n = (buf.length - i) % 3;
  if (n === 0) return buf.toString('base64', i);
  this.lastNeed = 3 - n;
  this.lastTotal = 3;
  if (n === 1) {
    this.lastChar[0] = buf[buf.length - 1];
  } else {
    this.lastChar[0] = buf[buf.length - 2];
    this.lastChar[1] = buf[buf.length - 1];
  }
  return buf.toString('base64', i, buf.length - n);
}

// Flushes the 1 or 2 buffered bytes (if any) as a final base64 group.
// `buf` is optional; when given and non-empty it is written first. The
// buffered state is cleared after flushing so the decoder can be reused.
function base64End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (!this.lastNeed) return r;
  // Number of bytes currently held in lastChar.
  var held = 3 - this.lastNeed;
  this.lastNeed = 0;
  this.lastTotal = 0;
  return r + this.lastChar.toString('base64', 0, held);
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
function simpleWrite(buf) {
  return buf.toString(this.encoding);
}

// Decodes the optional final buffer via write(); nothing is ever buffered for
// these encodings, so there is nothing else to flush.
function simpleEnd(buf) {
  return buf && buf.length ? this.write(buf) : '';
}