• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "string_decoder.h"  // NOLINT(build/include_inline)
2 #include "string_decoder-inl.h"
3 
4 #include "env-inl.h"
5 #include "node_buffer.h"
6 #include "node_errors.h"
7 #include "node_external_reference.h"
8 #include "string_bytes.h"
9 #include "util.h"
10 
11 using v8::Array;
12 using v8::ArrayBufferView;
13 using v8::Context;
14 using v8::FunctionCallbackInfo;
15 using v8::Integer;
16 using v8::Isolate;
17 using v8::Local;
18 using v8::MaybeLocal;
19 using v8::Object;
20 using v8::String;
21 using v8::Value;
22 
23 namespace node {
24 
25 namespace {
26 
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)27 MaybeLocal<String> MakeString(Isolate* isolate,
28                               const char* data,
29                               size_t length,
30                               enum encoding encoding) {
31   Local<Value> error;
32   MaybeLocal<Value> ret;
33   if (encoding == UTF8) {
34     MaybeLocal<String> utf8_string = String::NewFromUtf8(
35         isolate,
36         data,
37         v8::NewStringType::kNormal,
38         length);
39     if (utf8_string.IsEmpty()) {
40       isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
41       return MaybeLocal<String>();
42     } else {
43       return utf8_string;
44     }
45   } else {
46     ret = StringBytes::Encode(
47         isolate,
48         data,
49         length,
50         encoding,
51         &error);
52   }
53 
54   if (ret.IsEmpty()) {
55     CHECK(!error.IsEmpty());
56     isolate->ThrowException(error);
57   }
58 
59   DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
60   return ret.FromMaybe(Local<Value>()).As<String>();
61 }
62 
63 }  // anonymous namespace
64 
65 
// Decodes the `*nread_ptr` bytes at `data` into a string according to
// Encoding().
//
// For UTF8, UCS2, BASE64 and BASE64URL the decoder carries state across calls:
//  - If the previous call left MissingBytes() > 0, bytes from the start of
//    `data` are appended to the incomplete character buffer. Once nothing is
//    missing any more, that buffer is turned into a small string which is
//    prepended to the result.
//  - Trailing bytes of `data` that do not yet form a complete unit are copied
//    into the incomplete character buffer and left out of the result.
// `*nread_ptr` is increased by BufferedBytes() when the buffered character is
// emitted, and decreased by the number of trailing bytes held back.
//
// For ASCII, HEX and LATIN1 the input is passed to MakeString() unchanged;
// any other encoding fails the CHECK in the final branch.
//
// Returns an empty MaybeLocal when MakeString() fails.
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)66 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
67                                              const char* data,
68                                              size_t* nread_ptr) {
69   Local<String> prepend, body;
70 
     // `nread` is the working count of bytes not yet consumed from `data`;
     // both `data` and `nread` are advanced/shrunk as bytes are consumed.
71   size_t nread = *nread_ptr;
72 
73   if (Encoding() == UTF8 ||
74       Encoding() == UCS2 ||
75       Encoding() == BASE64 ||
76       Encoding() == BASE64URL) {
77     // See if we want bytes to finish a character from the previous
78     // chunk; if so, copy the new bytes to the missing bytes buffer
79     // and create a small string from it that is to be prepended to the
80     // main body.
81     if (MissingBytes() > 0) {
82       // There are never more bytes missing than the pre-calculated maximum.
83       CHECK_LE(MissingBytes() + BufferedBytes(),
84                kIncompleteCharactersEnd);
85       if (Encoding() == UTF8) {
86         // For UTF-8, we need special treatment to align with the V8 decoder:
87         // If an incomplete character is found at a chunk boundary, we use
88         // its remainder and pass it to V8 as-is.
89         for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
90           if ((data[i] & 0xC0) != 0x80) {
91             // This byte is not a continuation byte even though it should have
92             // been one. We stop decoding of the incomplete character at this
93             // point (but still use the rest of the incomplete bytes from this
94             // chunk) and assume that the new, unexpected byte starts a new one.
95             state_[kMissingBytes] = 0;
96             memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
97             state_[kBufferedBytes] += i;
98             data += i;
99             nread -= i;
100             break;
101           }
102         }
103       }
104 
          // Take as many bytes as are still missing, capped by what this chunk
          // provides. After the UTF-8 early exit above MissingBytes() is 0, so
          // nothing further is taken here.
105       size_t found_bytes =
106           std::min(nread, static_cast<size_t>(MissingBytes()));
107       memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
108              data,
109              found_bytes);
110       // Adjust the two buffers.
111       data += found_bytes;
112       nread -= found_bytes;
113 
114       state_[kMissingBytes] -= found_bytes;
115       state_[kBufferedBytes] += found_bytes;
116 
117       if (LIKELY(MissingBytes() == 0)) {
118         // If no more bytes are missing, create a small string that we
119         // will later prepend.
120         if (!MakeString(isolate,
121                         IncompleteCharacterBuffer(),
122                         BufferedBytes(),
123                         Encoding()).ToLocal(&prepend)) {
124           return MaybeLocal<String>();
125         }
126 
            // Report the bytes emitted from the incomplete character buffer
            // (BufferedBytes() here includes the bytes copied in above).
127         *nread_ptr += BufferedBytes();
128         // No more buffered bytes.
129         state_[kBufferedBytes] = 0;
130       }
131     }
132 
133     // It could be that trying to finish the previous chunk already
134     // consumed all data that we received in this chunk.
135     if (UNLIKELY(nread == 0)) {
          // Move the prepended string (if any) into `body` so that the final
          // step below returns it directly instead of concatenating.
136       body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
137       prepend = Local<String>();
138     } else {
139       // If not, that means there is no character left to finish at this point.
140       DCHECK_EQ(MissingBytes(), 0);
141       DCHECK_EQ(BufferedBytes(), 0);
142 
143       // See whether there is a character that we may have to cut off and
144       // finish when receiving the next chunk.
145       if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
146         // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
147         // This means we'll need to figure out where the character to which
148         // the byte belongs begins.
            // The loop walks backwards from the last byte; kBufferedBytes counts
            // the bytes visited so far. It always terminates through one of
            // the `break` statements below.
149         for (size_t i = nread - 1; ; --i) {
150           DCHECK_LT(i, nread);
151           state_[kBufferedBytes]++;
152           if ((data[i] & 0xC0) == 0x80) {
153             // This byte does not start a character (a "trailing" byte).
154             if (state_[kBufferedBytes] >= 4 || i == 0) {
155               // We either have more than 4 trailing bytes (which means
156               // the current character would not be inside the range for
157               // valid Unicode, and in particular cannot be represented
158               // through JavaScript's UTF-16-based approach to strings), or the
159               // current buffer does not contain the start of an UTF-8 character
160               // at all. Either way, this is invalid UTF8 and we can just
161               // let the engine's decoder handle it.
162               state_[kBufferedBytes] = 0;
163               break;
164             }
165           } else {
166             // Found the first byte of a UTF-8 character. By looking at the
167             // upper bits we can tell how long the character *should* be.
                // kMissingBytes temporarily holds that total length; it is
                // reduced to the number of bytes still outstanding below.
168             if ((data[i] & 0xE0) == 0xC0) {
169               state_[kMissingBytes] = 2;
170             } else if ((data[i] & 0xF0) == 0xE0) {
171               state_[kMissingBytes] = 3;
172             } else if ((data[i] & 0xF8) == 0xF0) {
173               state_[kMissingBytes] = 4;
174             } else {
175               // This lead byte would indicate a character outside of the
176               // representable range.
177               state_[kBufferedBytes] = 0;
178               break;
179             }
180 
181             if (BufferedBytes() >= MissingBytes()) {
182               // Received more or exactly as many trailing bytes than the lead
183               // character would indicate. In the "==" case, we have valid
184               // data and don't need to slice anything off;
185               // in the ">" case, this is invalid UTF-8 anyway.
186               state_[kMissingBytes] = 0;
187               state_[kBufferedBytes] = 0;
188             }
189 
                // Total length minus bytes already seen = bytes still missing.
                // If both counters were reset just above, this subtracts 0.
190             state_[kMissingBytes] -= state_[kBufferedBytes];
191             break;
192           }
193         }
194       } else if (Encoding() == UCS2) {
195         if ((nread % 2) == 1) {
196           // We got half a codepoint, and need the second byte of it.
197           state_[kBufferedBytes] = 1;
198           state_[kMissingBytes] = 1;
199         } else if ((data[nread - 1] & 0xFC) == 0xD8) {
200           // Half a split UTF-16 character.
              // The mask matches a final byte in the range 0xD8..0xDB; the
              // last two bytes are held back and two more are requested.
201           state_[kBufferedBytes] = 2;
202           state_[kMissingBytes] = 2;
203         }
204       } else if (Encoding() == BASE64 || Encoding() == BASE64URL) {
            // Hold back the remainder so that the body handed to MakeString()
            // is a multiple of three bytes long.
205         state_[kBufferedBytes] = nread % 3;
206         if (state_[kBufferedBytes] > 0)
207           state_[kMissingBytes] = 3 - BufferedBytes();
208       }
209 
210       if (BufferedBytes() > 0) {
211         // Copy the requested number of buffered bytes from the end of the
212         // input into the incomplete character buffer.
213         nread -= BufferedBytes();
214         *nread_ptr -= BufferedBytes();
215         memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
216       }
217 
218       if (nread > 0) {
219         if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
220           return MaybeLocal<String>();
221       } else {
222         body = String::Empty(isolate);
223       }
224     }
225 
226     if (prepend.IsEmpty()) {
227       return body;
228     } else {
229       return String::Concat(isolate, prepend, body);
230     }
231   } else {
        // These encodings are decoded in one step without using the
        // incomplete character buffer.
232     CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
233     return MakeString(isolate, data, nread, Encoding());
234   }
235 }
236 
FlushData(Isolate * isolate)237 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
238   if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
239     CHECK_EQ(MissingBytes(), 0);
240     CHECK_EQ(BufferedBytes(), 0);
241   }
242 
243   if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
244     // Ignore a single trailing byte, like the JS decoder does.
245     state_[kMissingBytes]--;
246     state_[kBufferedBytes]--;
247   }
248 
249   if (BufferedBytes() == 0)
250     return String::Empty(isolate);
251 
252   MaybeLocal<String> ret =
253       MakeString(isolate,
254                  IncompleteCharacterBuffer(),
255                  BufferedBytes(),
256                  Encoding());
257 
258   state_[kMissingBytes] = 0;
259   state_[kBufferedBytes] = 0;
260 
261   return ret;
262 }
263 
264 namespace {
265 
DecodeData(const FunctionCallbackInfo<Value> & args)266 void DecodeData(const FunctionCallbackInfo<Value>& args) {
267   StringDecoder* decoder =
268       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
269   CHECK_NOT_NULL(decoder);
270 
271   CHECK(args[1]->IsArrayBufferView());
272   ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
273   size_t length = content.length();
274 
275   MaybeLocal<String> ret =
276       decoder->DecodeData(args.GetIsolate(), content.data(), &length);
277   if (!ret.IsEmpty())
278     args.GetReturnValue().Set(ret.ToLocalChecked());
279 }
280 
FlushData(const FunctionCallbackInfo<Value> & args)281 void FlushData(const FunctionCallbackInfo<Value>& args) {
282   StringDecoder* decoder =
283       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
284   CHECK_NOT_NULL(decoder);
285   MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
286   if (!ret.IsEmpty())
287     args.GetReturnValue().Set(ret.ToLocalChecked());
288 }
289 
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)290 void InitializeStringDecoder(Local<Object> target,
291                              Local<Value> unused,
292                              Local<Context> context,
293                              void* priv) {
294   Environment* env = Environment::GetCurrent(context);
295   Isolate* isolate = env->isolate();
296 
297 #define SET_DECODER_CONSTANT(name)                                            \
298   target->Set(context,                                                        \
299               FIXED_ONE_BYTE_STRING(isolate, #name),                          \
300               Integer::New(isolate, StringDecoder::name)).FromJust()
301 
302   SET_DECODER_CONSTANT(kIncompleteCharactersStart);
303   SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
304   SET_DECODER_CONSTANT(kMissingBytes);
305   SET_DECODER_CONSTANT(kBufferedBytes);
306   SET_DECODER_CONSTANT(kEncodingField);
307   SET_DECODER_CONSTANT(kNumFields);
308 
309   Local<Array> encodings = Array::New(isolate);
310 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
311   encodings->Set(context,                                                     \
312                  static_cast<int32_t>(cname),                                 \
313                  FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
314   ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
315   ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
316   ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
317   ADD_TO_ENCODINGS_ARRAY(BASE64URL, "base64url");
318   ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
319   ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
320   ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
321   ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
322 
323   target->Set(context,
324               FIXED_ONE_BYTE_STRING(isolate, "encodings"),
325               encodings).Check();
326 
327   target->Set(context,
328               FIXED_ONE_BYTE_STRING(isolate, "kSize"),
329               Integer::New(isolate, sizeof(StringDecoder))).Check();
330 
331   SetMethod(context, target, "decode", DecodeData);
332   SetMethod(context, target, "flush", FlushData);
333 }
334 
335 }  // anonymous namespace
336 
RegisterStringDecoderExternalReferences(ExternalReferenceRegistry * registry)337 void RegisterStringDecoderExternalReferences(
338     ExternalReferenceRegistry* registry) {
339   registry->Register(DecodeData);
340   registry->Register(FlushData);
341 }
342 
343 }  // namespace node
344 
// Hook the `string_decoder` binding into Node: the first macro names
// node::InitializeStringDecoder as its initializer, the second names
// node::RegisterStringDecoderExternalReferences as its external-reference
// registrar. NOTE(review): both macros are defined outside this file;
// presumably they register an internal, context-aware binding -- confirm
// against their definitions.
345 NODE_BINDING_CONTEXT_AWARE_INTERNAL(string_decoder,
346                                     node::InitializeStringDecoder)
347 NODE_BINDING_EXTERNAL_REFERENCE(string_decoder,
348                                 node::RegisterStringDecoderExternalReferences)
349