• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "string_decoder.h"  // NOLINT(build/include_inline)
2 #include "string_decoder-inl.h"
3 
4 #include "env-inl.h"
5 #include "node_buffer.h"
6 #include "node_errors.h"
7 #include "string_bytes.h"
8 #include "util.h"
9 
10 using v8::Array;
11 using v8::ArrayBufferView;
12 using v8::Context;
13 using v8::FunctionCallbackInfo;
14 using v8::Integer;
15 using v8::Isolate;
16 using v8::Local;
17 using v8::MaybeLocal;
18 using v8::Object;
19 using v8::String;
20 using v8::Value;
21 
22 namespace node {
23 
24 namespace {
25 
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)26 MaybeLocal<String> MakeString(Isolate* isolate,
27                               const char* data,
28                               size_t length,
29                               enum encoding encoding) {
30   Local<Value> error;
31   MaybeLocal<Value> ret;
32   if (encoding == UTF8) {
33     MaybeLocal<String> utf8_string = String::NewFromUtf8(
34         isolate,
35         data,
36         v8::NewStringType::kNormal,
37         length);
38     if (utf8_string.IsEmpty()) {
39       isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
40       return MaybeLocal<String>();
41     } else {
42       return utf8_string;
43     }
44   } else {
45     ret = StringBytes::Encode(
46         isolate,
47         data,
48         length,
49         encoding,
50         &error);
51   }
52 
53   if (ret.IsEmpty()) {
54     CHECK(!error.IsEmpty());
55     isolate->ThrowException(error);
56   }
57 
58   DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
59   return ret.FromMaybe(Local<Value>()).As<String>();
60 }
61 
62 }  // anonymous namespace
63 
64 
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)65 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
66                                              const char* data,
67                                              size_t* nread_ptr) {
68   Local<String> prepend, body;
69 
70   size_t nread = *nread_ptr;
71 
72   if (Encoding() == UTF8 ||
73       Encoding() == UCS2 ||
74       Encoding() == BASE64 ||
75       Encoding() == BASE64URL) {
76     // See if we want bytes to finish a character from the previous
77     // chunk; if so, copy the new bytes to the missing bytes buffer
78     // and create a small string from it that is to be prepended to the
79     // main body.
80     if (MissingBytes() > 0) {
81       // There are never more bytes missing than the pre-calculated maximum.
82       CHECK_LE(MissingBytes() + BufferedBytes(),
83                kIncompleteCharactersEnd);
84       if (Encoding() == UTF8) {
85         // For UTF-8, we need special treatment to align with the V8 decoder:
86         // If an incomplete character is found at a chunk boundary, we use
87         // its remainder and pass it to V8 as-is.
88         for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
89           if ((data[i] & 0xC0) != 0x80) {
90             // This byte is not a continuation byte even though it should have
91             // been one. We stop decoding of the incomplete character at this
92             // point (but still use the rest of the incomplete bytes from this
93             // chunk) and assume that the new, unexpected byte starts a new one.
94             state_[kMissingBytes] = 0;
95             memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
96             state_[kBufferedBytes] += i;
97             data += i;
98             nread -= i;
99             break;
100           }
101         }
102       }
103 
104       size_t found_bytes =
105           std::min(nread, static_cast<size_t>(MissingBytes()));
106       memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
107              data,
108              found_bytes);
109       // Adjust the two buffers.
110       data += found_bytes;
111       nread -= found_bytes;
112 
113       state_[kMissingBytes] -= found_bytes;
114       state_[kBufferedBytes] += found_bytes;
115 
116       if (LIKELY(MissingBytes() == 0)) {
117         // If no more bytes are missing, create a small string that we
118         // will later prepend.
119         if (!MakeString(isolate,
120                         IncompleteCharacterBuffer(),
121                         BufferedBytes(),
122                         Encoding()).ToLocal(&prepend)) {
123           return MaybeLocal<String>();
124         }
125 
126         *nread_ptr += BufferedBytes();
127         // No more buffered bytes.
128         state_[kBufferedBytes] = 0;
129       }
130     }
131 
132     // It could be that trying to finish the previous chunk already
133     // consumed all data that we received in this chunk.
134     if (UNLIKELY(nread == 0)) {
135       body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
136       prepend = Local<String>();
137     } else {
138       // If not, that means is no character left to finish at this point.
139       DCHECK_EQ(MissingBytes(), 0);
140       DCHECK_EQ(BufferedBytes(), 0);
141 
142       // See whether there is a character that we may have to cut off and
143       // finish when receiving the next chunk.
144       if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
145         // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
146         // This means we'll need to figure out where the character to which
147         // the byte belongs begins.
148         for (size_t i = nread - 1; ; --i) {
149           DCHECK_LT(i, nread);
150           state_[kBufferedBytes]++;
151           if ((data[i] & 0xC0) == 0x80) {
152             // This byte does not start a character (a "trailing" byte).
153             if (state_[kBufferedBytes] >= 4 || i == 0) {
154               // We either have more then 4 trailing bytes (which means
155               // the current character would not be inside the range for
156               // valid Unicode, and in particular cannot be represented
157               // through JavaScript's UTF-16-based approach to strings), or the
158               // current buffer does not contain the start of an UTF-8 character
159               // at all. Either way, this is invalid UTF8 and we can just
160               // let the engine's decoder handle it.
161               state_[kBufferedBytes] = 0;
162               break;
163             }
164           } else {
165             // Found the first byte of a UTF-8 character. By looking at the
166             // upper bits we can tell how long the character *should* be.
167             if ((data[i] & 0xE0) == 0xC0) {
168               state_[kMissingBytes] = 2;
169             } else if ((data[i] & 0xF0) == 0xE0) {
170               state_[kMissingBytes] = 3;
171             } else if ((data[i] & 0xF8) == 0xF0) {
172               state_[kMissingBytes] = 4;
173             } else {
174               // This lead byte would indicate a character outside of the
175               // representable range.
176               state_[kBufferedBytes] = 0;
177               break;
178             }
179 
180             if (BufferedBytes() >= MissingBytes()) {
181               // Received more or exactly as many trailing bytes than the lead
182               // character would indicate. In the "==" case, we have valid
183               // data and don't need to slice anything off;
184               // in the ">" case, this is invalid UTF-8 anyway.
185               state_[kMissingBytes] = 0;
186               state_[kBufferedBytes] = 0;
187             }
188 
189             state_[kMissingBytes] -= state_[kBufferedBytes];
190             break;
191           }
192         }
193       } else if (Encoding() == UCS2) {
194         if ((nread % 2) == 1) {
195           // We got half a codepoint, and need the second byte of it.
196           state_[kBufferedBytes] = 1;
197           state_[kMissingBytes] = 1;
198         } else if ((data[nread - 1] & 0xFC) == 0xD8) {
199           // Half a split UTF-16 character.
200           state_[kBufferedBytes] = 2;
201           state_[kMissingBytes] = 2;
202         }
203       } else if (Encoding() == BASE64 || Encoding() == BASE64URL) {
204         state_[kBufferedBytes] = nread % 3;
205         if (state_[kBufferedBytes] > 0)
206           state_[kMissingBytes] = 3 - BufferedBytes();
207       }
208 
209       if (BufferedBytes() > 0) {
210         // Copy the requested number of buffered bytes from the end of the
211         // input into the incomplete character buffer.
212         nread -= BufferedBytes();
213         *nread_ptr -= BufferedBytes();
214         memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
215       }
216 
217       if (nread > 0) {
218         if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
219           return MaybeLocal<String>();
220       } else {
221         body = String::Empty(isolate);
222       }
223     }
224 
225     if (prepend.IsEmpty()) {
226       return body;
227     } else {
228       return String::Concat(isolate, prepend, body);
229     }
230   } else {
231     CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
232     return MakeString(isolate, data, nread, Encoding());
233   }
234 }
235 
FlushData(Isolate * isolate)236 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
237   if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
238     CHECK_EQ(MissingBytes(), 0);
239     CHECK_EQ(BufferedBytes(), 0);
240   }
241 
242   if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
243     // Ignore a single trailing byte, like the JS decoder does.
244     state_[kMissingBytes]--;
245     state_[kBufferedBytes]--;
246   }
247 
248   if (BufferedBytes() == 0)
249     return String::Empty(isolate);
250 
251   MaybeLocal<String> ret =
252       MakeString(isolate,
253                  IncompleteCharacterBuffer(),
254                  BufferedBytes(),
255                  Encoding());
256 
257   state_[kMissingBytes] = 0;
258   state_[kBufferedBytes] = 0;
259 
260   return ret;
261 }
262 
263 namespace {
264 
DecodeData(const FunctionCallbackInfo<Value> & args)265 void DecodeData(const FunctionCallbackInfo<Value>& args) {
266   StringDecoder* decoder =
267       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
268   CHECK_NOT_NULL(decoder);
269 
270   CHECK(args[1]->IsArrayBufferView());
271   ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
272   size_t length = content.length();
273 
274   MaybeLocal<String> ret =
275       decoder->DecodeData(args.GetIsolate(), content.data(), &length);
276   if (!ret.IsEmpty())
277     args.GetReturnValue().Set(ret.ToLocalChecked());
278 }
279 
FlushData(const FunctionCallbackInfo<Value> & args)280 void FlushData(const FunctionCallbackInfo<Value>& args) {
281   StringDecoder* decoder =
282       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
283   CHECK_NOT_NULL(decoder);
284   MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
285   if (!ret.IsEmpty())
286     args.GetReturnValue().Set(ret.ToLocalChecked());
287 }
288 
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)289 void InitializeStringDecoder(Local<Object> target,
290                              Local<Value> unused,
291                              Local<Context> context,
292                              void* priv) {
293   Environment* env = Environment::GetCurrent(context);
294   Isolate* isolate = env->isolate();
295 
296 #define SET_DECODER_CONSTANT(name)                                            \
297   target->Set(context,                                                        \
298               FIXED_ONE_BYTE_STRING(isolate, #name),                          \
299               Integer::New(isolate, StringDecoder::name)).FromJust()
300 
301   SET_DECODER_CONSTANT(kIncompleteCharactersStart);
302   SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
303   SET_DECODER_CONSTANT(kMissingBytes);
304   SET_DECODER_CONSTANT(kBufferedBytes);
305   SET_DECODER_CONSTANT(kEncodingField);
306   SET_DECODER_CONSTANT(kNumFields);
307 
308   Local<Array> encodings = Array::New(isolate);
309 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
310   encodings->Set(context,                                                     \
311                  static_cast<int32_t>(cname),                                 \
312                  FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
313   ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
314   ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
315   ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
316   ADD_TO_ENCODINGS_ARRAY(BASE64URL, "base64url");
317   ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
318   ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
319   ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
320   ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
321 
322   target->Set(context,
323               FIXED_ONE_BYTE_STRING(isolate, "encodings"),
324               encodings).Check();
325 
326   target->Set(context,
327               FIXED_ONE_BYTE_STRING(isolate, "kSize"),
328               Integer::New(isolate, sizeof(StringDecoder))).Check();
329 
330   env->SetMethod(target, "decode", DecodeData);
331   env->SetMethod(target, "flush", FlushData);
332 }
333 
334 }  // anonymous namespace
335 
336 }  // namespace node
337 
338 NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
339                                    node::InitializeStringDecoder)
340