• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#include "string_decoder.h"  // NOLINT(build/include_inline)
#include "string_decoder-inl.h"

#include "env-inl.h"
#include "node_buffer.h"
#include "node_errors.h"
#include "string_bytes.h"
#include "util.h"
8 
9 using v8::Array;
10 using v8::ArrayBufferView;
11 using v8::Context;
12 using v8::FunctionCallbackInfo;
13 using v8::Integer;
14 using v8::Isolate;
15 using v8::Local;
16 using v8::MaybeLocal;
17 using v8::Object;
18 using v8::String;
19 using v8::Value;
20 
21 namespace node {
22 
23 namespace {
24 
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)25 MaybeLocal<String> MakeString(Isolate* isolate,
26                               const char* data,
27                               size_t length,
28                               enum encoding encoding) {
29   Local<Value> error;
30   MaybeLocal<Value> ret;
31   if (encoding == UTF8) {
32     return String::NewFromUtf8(
33         isolate,
34         data,
35         v8::NewStringType::kNormal,
36         length);
37   } else {
38     ret = StringBytes::Encode(
39         isolate,
40         data,
41         length,
42         encoding,
43         &error);
44   }
45 
46   if (ret.IsEmpty()) {
47     CHECK(!error.IsEmpty());
48     isolate->ThrowException(error);
49   }
50 
51   DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
52   return ret.FromMaybe(Local<Value>()).As<String>();
53 }
54 
55 }  // anonymous namespace
56 
57 
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)58 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
59                                              const char* data,
60                                              size_t* nread_ptr) {
61   Local<String> prepend, body;
62 
63   size_t nread = *nread_ptr;
64 
65   if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) {
66     // See if we want bytes to finish a character from the previous
67     // chunk; if so, copy the new bytes to the missing bytes buffer
68     // and create a small string from it that is to be prepended to the
69     // main body.
70     if (MissingBytes() > 0) {
71       // There are never more bytes missing than the pre-calculated maximum.
72       CHECK_LE(MissingBytes() + BufferedBytes(),
73                kIncompleteCharactersEnd);
74       if (Encoding() == UTF8) {
75         // For UTF-8, we need special treatment to align with the V8 decoder:
76         // If an incomplete character is found at a chunk boundary, we use
77         // its remainder and pass it to V8 as-is.
78         for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
79           if ((data[i] & 0xC0) != 0x80) {
80             // This byte is not a continuation byte even though it should have
81             // been one. We stop decoding of the incomplete character at this
82             // point (but still use the rest of the incomplete bytes from this
83             // chunk) and assume that the new, unexpected byte starts a new one.
84             state_[kMissingBytes] = 0;
85             memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
86             state_[kBufferedBytes] += i;
87             data += i;
88             nread -= i;
89             break;
90           }
91         }
92       }
93 
94       size_t found_bytes =
95           std::min(nread, static_cast<size_t>(MissingBytes()));
96       memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
97              data,
98              found_bytes);
99       // Adjust the two buffers.
100       data += found_bytes;
101       nread -= found_bytes;
102 
103       state_[kMissingBytes] -= found_bytes;
104       state_[kBufferedBytes] += found_bytes;
105 
106       if (LIKELY(MissingBytes() == 0)) {
107         // If no more bytes are missing, create a small string that we
108         // will later prepend.
109         if (!MakeString(isolate,
110                         IncompleteCharacterBuffer(),
111                         BufferedBytes(),
112                         Encoding()).ToLocal(&prepend)) {
113           return MaybeLocal<String>();
114         }
115 
116         *nread_ptr += BufferedBytes();
117         // No more buffered bytes.
118         state_[kBufferedBytes] = 0;
119       }
120     }
121 
122     // It could be that trying to finish the previous chunk already
123     // consumed all data that we received in this chunk.
124     if (UNLIKELY(nread == 0)) {
125       body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
126       prepend = Local<String>();
127     } else {
128       // If not, that means is no character left to finish at this point.
129       DCHECK_EQ(MissingBytes(), 0);
130       DCHECK_EQ(BufferedBytes(), 0);
131 
132       // See whether there is a character that we may have to cut off and
133       // finish when receiving the next chunk.
134       if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
135         // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
136         // This means we'll need to figure out where the character to which
137         // the byte belongs begins.
138         for (size_t i = nread - 1; ; --i) {
139           DCHECK_LT(i, nread);
140           state_[kBufferedBytes]++;
141           if ((data[i] & 0xC0) == 0x80) {
142             // This byte does not start a character (a "trailing" byte).
143             if (state_[kBufferedBytes] >= 4 || i == 0) {
144               // We either have more then 4 trailing bytes (which means
145               // the current character would not be inside the range for
146               // valid Unicode, and in particular cannot be represented
147               // through JavaScript's UTF-16-based approach to strings), or the
148               // current buffer does not contain the start of an UTF-8 character
149               // at all. Either way, this is invalid UTF8 and we can just
150               // let the engine's decoder handle it.
151               state_[kBufferedBytes] = 0;
152               break;
153             }
154           } else {
155             // Found the first byte of a UTF-8 character. By looking at the
156             // upper bits we can tell how long the character *should* be.
157             if ((data[i] & 0xE0) == 0xC0) {
158               state_[kMissingBytes] = 2;
159             } else if ((data[i] & 0xF0) == 0xE0) {
160               state_[kMissingBytes] = 3;
161             } else if ((data[i] & 0xF8) == 0xF0) {
162               state_[kMissingBytes] = 4;
163             } else {
164               // This lead byte would indicate a character outside of the
165               // representable range.
166               state_[kBufferedBytes] = 0;
167               break;
168             }
169 
170             if (BufferedBytes() >= MissingBytes()) {
171               // Received more or exactly as many trailing bytes than the lead
172               // character would indicate. In the "==" case, we have valid
173               // data and don't need to slice anything off;
174               // in the ">" case, this is invalid UTF-8 anyway.
175               state_[kMissingBytes] = 0;
176               state_[kBufferedBytes] = 0;
177             }
178 
179             state_[kMissingBytes] -= state_[kBufferedBytes];
180             break;
181           }
182         }
183       } else if (Encoding() == UCS2) {
184         if ((nread % 2) == 1) {
185           // We got half a codepoint, and need the second byte of it.
186           state_[kBufferedBytes] = 1;
187           state_[kMissingBytes] = 1;
188         } else if ((data[nread - 1] & 0xFC) == 0xD8) {
189           // Half a split UTF-16 character.
190           state_[kBufferedBytes] = 2;
191           state_[kMissingBytes] = 2;
192         }
193       } else if (Encoding() == BASE64) {
194         state_[kBufferedBytes] = nread % 3;
195         if (state_[kBufferedBytes] > 0)
196           state_[kMissingBytes] = 3 - BufferedBytes();
197       }
198 
199       if (BufferedBytes() > 0) {
200         // Copy the requested number of buffered bytes from the end of the
201         // input into the incomplete character buffer.
202         nread -= BufferedBytes();
203         *nread_ptr -= BufferedBytes();
204         memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
205       }
206 
207       if (nread > 0) {
208         if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
209           return MaybeLocal<String>();
210       } else {
211         body = String::Empty(isolate);
212       }
213     }
214 
215     if (prepend.IsEmpty()) {
216       return body;
217     } else {
218       return String::Concat(isolate, prepend, body);
219     }
220   } else {
221     CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
222     return MakeString(isolate, data, nread, Encoding());
223   }
224 }
225 
FlushData(Isolate * isolate)226 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
227   if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
228     CHECK_EQ(MissingBytes(), 0);
229     CHECK_EQ(BufferedBytes(), 0);
230   }
231 
232   if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
233     // Ignore a single trailing byte, like the JS decoder does.
234     state_[kMissingBytes]--;
235     state_[kBufferedBytes]--;
236   }
237 
238   if (BufferedBytes() == 0)
239     return String::Empty(isolate);
240 
241   MaybeLocal<String> ret =
242       MakeString(isolate,
243                  IncompleteCharacterBuffer(),
244                  BufferedBytes(),
245                  Encoding());
246 
247   state_[kMissingBytes] = 0;
248   state_[kBufferedBytes] = 0;
249 
250   return ret;
251 }
252 
253 namespace {
254 
DecodeData(const FunctionCallbackInfo<Value> & args)255 void DecodeData(const FunctionCallbackInfo<Value>& args) {
256   StringDecoder* decoder =
257       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
258   CHECK_NOT_NULL(decoder);
259 
260   CHECK(args[1]->IsArrayBufferView());
261   ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
262   size_t length = content.length();
263 
264   MaybeLocal<String> ret =
265       decoder->DecodeData(args.GetIsolate(), content.data(), &length);
266   if (!ret.IsEmpty())
267     args.GetReturnValue().Set(ret.ToLocalChecked());
268 }
269 
FlushData(const FunctionCallbackInfo<Value> & args)270 void FlushData(const FunctionCallbackInfo<Value>& args) {
271   StringDecoder* decoder =
272       reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
273   CHECK_NOT_NULL(decoder);
274   MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
275   if (!ret.IsEmpty())
276     args.GetReturnValue().Set(ret.ToLocalChecked());
277 }
278 
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)279 void InitializeStringDecoder(Local<Object> target,
280                              Local<Value> unused,
281                              Local<Context> context,
282                              void* priv) {
283   Environment* env = Environment::GetCurrent(context);
284   Isolate* isolate = env->isolate();
285 
286 #define SET_DECODER_CONSTANT(name)                                            \
287   target->Set(context,                                                        \
288               FIXED_ONE_BYTE_STRING(isolate, #name),                          \
289               Integer::New(isolate, StringDecoder::name)).FromJust()
290 
291   SET_DECODER_CONSTANT(kIncompleteCharactersStart);
292   SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
293   SET_DECODER_CONSTANT(kMissingBytes);
294   SET_DECODER_CONSTANT(kBufferedBytes);
295   SET_DECODER_CONSTANT(kEncodingField);
296   SET_DECODER_CONSTANT(kNumFields);
297 
298   Local<Array> encodings = Array::New(isolate);
299 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
300   encodings->Set(context,                                                     \
301                  static_cast<int32_t>(cname),                                 \
302                  FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
303   ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
304   ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
305   ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
306   ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
307   ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
308   ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
309   ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
310 
311   target->Set(context,
312               FIXED_ONE_BYTE_STRING(isolate, "encodings"),
313               encodings).Check();
314 
315   target->Set(context,
316               FIXED_ONE_BYTE_STRING(isolate, "kSize"),
317               Integer::New(isolate, sizeof(StringDecoder))).Check();
318 
319   env->SetMethod(target, "decode", DecodeData);
320   env->SetMethod(target, "flush", FlushData);
321 }
322 
323 }  // anonymous namespace
324 
325 }  // namespace node
326 
// Hooks node::InitializeStringDecoder up under the module name
// `string_decoder`. NOTE(review): presumably this is what makes the binding
// available as an internal binding; confirm against the macro's definition.
NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
                                   node::InitializeStringDecoder)
329