1 #include "string_decoder.h" // NOLINT(build/include_inline)
2 #include "string_decoder-inl.h"
3
4 #include "env-inl.h"
5 #include "node_buffer.h"
6 #include "node_errors.h"
7 #include "string_bytes.h"
8 #include "util.h"
9
10 using v8::Array;
11 using v8::ArrayBufferView;
12 using v8::Context;
13 using v8::FunctionCallbackInfo;
14 using v8::Integer;
15 using v8::Isolate;
16 using v8::Local;
17 using v8::MaybeLocal;
18 using v8::Object;
19 using v8::String;
20 using v8::Value;
21
22 namespace node {
23
24 namespace {
25
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)26 MaybeLocal<String> MakeString(Isolate* isolate,
27 const char* data,
28 size_t length,
29 enum encoding encoding) {
30 Local<Value> error;
31 MaybeLocal<Value> ret;
32 if (encoding == UTF8) {
33 MaybeLocal<String> utf8_string = String::NewFromUtf8(
34 isolate,
35 data,
36 v8::NewStringType::kNormal,
37 length);
38 if (utf8_string.IsEmpty()) {
39 isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
40 return MaybeLocal<String>();
41 } else {
42 return utf8_string;
43 }
44 } else {
45 ret = StringBytes::Encode(
46 isolate,
47 data,
48 length,
49 encoding,
50 &error);
51 }
52
53 if (ret.IsEmpty()) {
54 CHECK(!error.IsEmpty());
55 isolate->ThrowException(error);
56 }
57
58 DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
59 return ret.FromMaybe(Local<Value>()).As<String>();
60 }
61
62 } // anonymous namespace
63
64
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)65 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
66 const char* data,
67 size_t* nread_ptr) {
68 Local<String> prepend, body;
69
70 size_t nread = *nread_ptr;
71
72 if (Encoding() == UTF8 ||
73 Encoding() == UCS2 ||
74 Encoding() == BASE64 ||
75 Encoding() == BASE64URL) {
76 // See if we want bytes to finish a character from the previous
77 // chunk; if so, copy the new bytes to the missing bytes buffer
78 // and create a small string from it that is to be prepended to the
79 // main body.
80 if (MissingBytes() > 0) {
81 // There are never more bytes missing than the pre-calculated maximum.
82 CHECK_LE(MissingBytes() + BufferedBytes(),
83 kIncompleteCharactersEnd);
84 if (Encoding() == UTF8) {
85 // For UTF-8, we need special treatment to align with the V8 decoder:
86 // If an incomplete character is found at a chunk boundary, we use
87 // its remainder and pass it to V8 as-is.
88 for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
89 if ((data[i] & 0xC0) != 0x80) {
90 // This byte is not a continuation byte even though it should have
91 // been one. We stop decoding of the incomplete character at this
92 // point (but still use the rest of the incomplete bytes from this
93 // chunk) and assume that the new, unexpected byte starts a new one.
94 state_[kMissingBytes] = 0;
95 memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
96 state_[kBufferedBytes] += i;
97 data += i;
98 nread -= i;
99 break;
100 }
101 }
102 }
103
104 size_t found_bytes =
105 std::min(nread, static_cast<size_t>(MissingBytes()));
106 memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
107 data,
108 found_bytes);
109 // Adjust the two buffers.
110 data += found_bytes;
111 nread -= found_bytes;
112
113 state_[kMissingBytes] -= found_bytes;
114 state_[kBufferedBytes] += found_bytes;
115
116 if (LIKELY(MissingBytes() == 0)) {
117 // If no more bytes are missing, create a small string that we
118 // will later prepend.
119 if (!MakeString(isolate,
120 IncompleteCharacterBuffer(),
121 BufferedBytes(),
122 Encoding()).ToLocal(&prepend)) {
123 return MaybeLocal<String>();
124 }
125
126 *nread_ptr += BufferedBytes();
127 // No more buffered bytes.
128 state_[kBufferedBytes] = 0;
129 }
130 }
131
132 // It could be that trying to finish the previous chunk already
133 // consumed all data that we received in this chunk.
134 if (UNLIKELY(nread == 0)) {
135 body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
136 prepend = Local<String>();
137 } else {
138 // If not, that means is no character left to finish at this point.
139 DCHECK_EQ(MissingBytes(), 0);
140 DCHECK_EQ(BufferedBytes(), 0);
141
142 // See whether there is a character that we may have to cut off and
143 // finish when receiving the next chunk.
144 if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
145 // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
146 // This means we'll need to figure out where the character to which
147 // the byte belongs begins.
148 for (size_t i = nread - 1; ; --i) {
149 DCHECK_LT(i, nread);
150 state_[kBufferedBytes]++;
151 if ((data[i] & 0xC0) == 0x80) {
152 // This byte does not start a character (a "trailing" byte).
153 if (state_[kBufferedBytes] >= 4 || i == 0) {
154 // We either have more then 4 trailing bytes (which means
155 // the current character would not be inside the range for
156 // valid Unicode, and in particular cannot be represented
157 // through JavaScript's UTF-16-based approach to strings), or the
158 // current buffer does not contain the start of an UTF-8 character
159 // at all. Either way, this is invalid UTF8 and we can just
160 // let the engine's decoder handle it.
161 state_[kBufferedBytes] = 0;
162 break;
163 }
164 } else {
165 // Found the first byte of a UTF-8 character. By looking at the
166 // upper bits we can tell how long the character *should* be.
167 if ((data[i] & 0xE0) == 0xC0) {
168 state_[kMissingBytes] = 2;
169 } else if ((data[i] & 0xF0) == 0xE0) {
170 state_[kMissingBytes] = 3;
171 } else if ((data[i] & 0xF8) == 0xF0) {
172 state_[kMissingBytes] = 4;
173 } else {
174 // This lead byte would indicate a character outside of the
175 // representable range.
176 state_[kBufferedBytes] = 0;
177 break;
178 }
179
180 if (BufferedBytes() >= MissingBytes()) {
181 // Received more or exactly as many trailing bytes than the lead
182 // character would indicate. In the "==" case, we have valid
183 // data and don't need to slice anything off;
184 // in the ">" case, this is invalid UTF-8 anyway.
185 state_[kMissingBytes] = 0;
186 state_[kBufferedBytes] = 0;
187 }
188
189 state_[kMissingBytes] -= state_[kBufferedBytes];
190 break;
191 }
192 }
193 } else if (Encoding() == UCS2) {
194 if ((nread % 2) == 1) {
195 // We got half a codepoint, and need the second byte of it.
196 state_[kBufferedBytes] = 1;
197 state_[kMissingBytes] = 1;
198 } else if ((data[nread - 1] & 0xFC) == 0xD8) {
199 // Half a split UTF-16 character.
200 state_[kBufferedBytes] = 2;
201 state_[kMissingBytes] = 2;
202 }
203 } else if (Encoding() == BASE64 || Encoding() == BASE64URL) {
204 state_[kBufferedBytes] = nread % 3;
205 if (state_[kBufferedBytes] > 0)
206 state_[kMissingBytes] = 3 - BufferedBytes();
207 }
208
209 if (BufferedBytes() > 0) {
210 // Copy the requested number of buffered bytes from the end of the
211 // input into the incomplete character buffer.
212 nread -= BufferedBytes();
213 *nread_ptr -= BufferedBytes();
214 memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
215 }
216
217 if (nread > 0) {
218 if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
219 return MaybeLocal<String>();
220 } else {
221 body = String::Empty(isolate);
222 }
223 }
224
225 if (prepend.IsEmpty()) {
226 return body;
227 } else {
228 return String::Concat(isolate, prepend, body);
229 }
230 } else {
231 CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
232 return MakeString(isolate, data, nread, Encoding());
233 }
234 }
235
FlushData(Isolate * isolate)236 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
237 if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
238 CHECK_EQ(MissingBytes(), 0);
239 CHECK_EQ(BufferedBytes(), 0);
240 }
241
242 if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
243 // Ignore a single trailing byte, like the JS decoder does.
244 state_[kMissingBytes]--;
245 state_[kBufferedBytes]--;
246 }
247
248 if (BufferedBytes() == 0)
249 return String::Empty(isolate);
250
251 MaybeLocal<String> ret =
252 MakeString(isolate,
253 IncompleteCharacterBuffer(),
254 BufferedBytes(),
255 Encoding());
256
257 state_[kMissingBytes] = 0;
258 state_[kBufferedBytes] = 0;
259
260 return ret;
261 }
262
263 namespace {
264
DecodeData(const FunctionCallbackInfo<Value> & args)265 void DecodeData(const FunctionCallbackInfo<Value>& args) {
266 StringDecoder* decoder =
267 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
268 CHECK_NOT_NULL(decoder);
269
270 CHECK(args[1]->IsArrayBufferView());
271 ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
272 size_t length = content.length();
273
274 MaybeLocal<String> ret =
275 decoder->DecodeData(args.GetIsolate(), content.data(), &length);
276 if (!ret.IsEmpty())
277 args.GetReturnValue().Set(ret.ToLocalChecked());
278 }
279
FlushData(const FunctionCallbackInfo<Value> & args)280 void FlushData(const FunctionCallbackInfo<Value>& args) {
281 StringDecoder* decoder =
282 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
283 CHECK_NOT_NULL(decoder);
284 MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
285 if (!ret.IsEmpty())
286 args.GetReturnValue().Set(ret.ToLocalChecked());
287 }
288
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)289 void InitializeStringDecoder(Local<Object> target,
290 Local<Value> unused,
291 Local<Context> context,
292 void* priv) {
293 Environment* env = Environment::GetCurrent(context);
294 Isolate* isolate = env->isolate();
295
296 #define SET_DECODER_CONSTANT(name) \
297 target->Set(context, \
298 FIXED_ONE_BYTE_STRING(isolate, #name), \
299 Integer::New(isolate, StringDecoder::name)).FromJust()
300
301 SET_DECODER_CONSTANT(kIncompleteCharactersStart);
302 SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
303 SET_DECODER_CONSTANT(kMissingBytes);
304 SET_DECODER_CONSTANT(kBufferedBytes);
305 SET_DECODER_CONSTANT(kEncodingField);
306 SET_DECODER_CONSTANT(kNumFields);
307
308 Local<Array> encodings = Array::New(isolate);
309 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \
310 encodings->Set(context, \
311 static_cast<int32_t>(cname), \
312 FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
313 ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
314 ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
315 ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
316 ADD_TO_ENCODINGS_ARRAY(BASE64URL, "base64url");
317 ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
318 ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
319 ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
320 ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
321
322 target->Set(context,
323 FIXED_ONE_BYTE_STRING(isolate, "encodings"),
324 encodings).Check();
325
326 target->Set(context,
327 FIXED_ONE_BYTE_STRING(isolate, "kSize"),
328 Integer::New(isolate, sizeof(StringDecoder))).Check();
329
330 env->SetMethod(target, "decode", DecodeData);
331 env->SetMethod(target, "flush", FlushData);
332 }
333
334 } // anonymous namespace
335
336 } // namespace node
337
// Associates the name `string_decoder` with node::InitializeStringDecoder.
// NOTE(review): presumably this registers the internal binding so that the
// initializer runs when the binding is loaded; the macro is defined outside
// this file, so confirm against its definition.
NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
                                   node::InitializeStringDecoder)
340