1 #include "string_decoder.h" // NOLINT(build/include_inline)
2 #include "string_decoder-inl.h"
3
4 #include "env-inl.h"
5 #include "node_buffer.h"
6 #include "node_errors.h"
7 #include "node_external_reference.h"
8 #include "string_bytes.h"
9 #include "util.h"
10
11 using v8::Array;
12 using v8::ArrayBufferView;
13 using v8::Context;
14 using v8::FunctionCallbackInfo;
15 using v8::Integer;
16 using v8::Isolate;
17 using v8::Local;
18 using v8::MaybeLocal;
19 using v8::Object;
20 using v8::String;
21 using v8::Value;
22
23 namespace node {
24
25 namespace {
26
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)27 MaybeLocal<String> MakeString(Isolate* isolate,
28 const char* data,
29 size_t length,
30 enum encoding encoding) {
31 Local<Value> error;
32 MaybeLocal<Value> ret;
33 if (encoding == UTF8) {
34 MaybeLocal<String> utf8_string = String::NewFromUtf8(
35 isolate,
36 data,
37 v8::NewStringType::kNormal,
38 length);
39 if (utf8_string.IsEmpty()) {
40 isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
41 return MaybeLocal<String>();
42 } else {
43 return utf8_string;
44 }
45 } else {
46 ret = StringBytes::Encode(
47 isolate,
48 data,
49 length,
50 encoding,
51 &error);
52 }
53
54 if (ret.IsEmpty()) {
55 CHECK(!error.IsEmpty());
56 isolate->ThrowException(error);
57 }
58
59 DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
60 return ret.FromMaybe(Local<Value>()).As<String>();
61 }
62
63 } // anonymous namespace
64
65
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)66 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
67 const char* data,
68 size_t* nread_ptr) {
69 Local<String> prepend, body;
70
71 size_t nread = *nread_ptr;
72
73 if (Encoding() == UTF8 ||
74 Encoding() == UCS2 ||
75 Encoding() == BASE64 ||
76 Encoding() == BASE64URL) {
77 // See if we want bytes to finish a character from the previous
78 // chunk; if so, copy the new bytes to the missing bytes buffer
79 // and create a small string from it that is to be prepended to the
80 // main body.
81 if (MissingBytes() > 0) {
82 // There are never more bytes missing than the pre-calculated maximum.
83 CHECK_LE(MissingBytes() + BufferedBytes(),
84 kIncompleteCharactersEnd);
85 if (Encoding() == UTF8) {
86 // For UTF-8, we need special treatment to align with the V8 decoder:
87 // If an incomplete character is found at a chunk boundary, we use
88 // its remainder and pass it to V8 as-is.
89 for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
90 if ((data[i] & 0xC0) != 0x80) {
91 // This byte is not a continuation byte even though it should have
92 // been one. We stop decoding of the incomplete character at this
93 // point (but still use the rest of the incomplete bytes from this
94 // chunk) and assume that the new, unexpected byte starts a new one.
95 state_[kMissingBytes] = 0;
96 memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
97 state_[kBufferedBytes] += i;
98 data += i;
99 nread -= i;
100 break;
101 }
102 }
103 }
104
105 size_t found_bytes =
106 std::min(nread, static_cast<size_t>(MissingBytes()));
107 memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
108 data,
109 found_bytes);
110 // Adjust the two buffers.
111 data += found_bytes;
112 nread -= found_bytes;
113
114 state_[kMissingBytes] -= found_bytes;
115 state_[kBufferedBytes] += found_bytes;
116
117 if (LIKELY(MissingBytes() == 0)) {
118 // If no more bytes are missing, create a small string that we
119 // will later prepend.
120 if (!MakeString(isolate,
121 IncompleteCharacterBuffer(),
122 BufferedBytes(),
123 Encoding()).ToLocal(&prepend)) {
124 return MaybeLocal<String>();
125 }
126
127 *nread_ptr += BufferedBytes();
128 // No more buffered bytes.
129 state_[kBufferedBytes] = 0;
130 }
131 }
132
133 // It could be that trying to finish the previous chunk already
134 // consumed all data that we received in this chunk.
135 if (UNLIKELY(nread == 0)) {
136 body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
137 prepend = Local<String>();
138 } else {
139 // If not, that means is no character left to finish at this point.
140 DCHECK_EQ(MissingBytes(), 0);
141 DCHECK_EQ(BufferedBytes(), 0);
142
143 // See whether there is a character that we may have to cut off and
144 // finish when receiving the next chunk.
145 if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
146 // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
147 // This means we'll need to figure out where the character to which
148 // the byte belongs begins.
149 for (size_t i = nread - 1; ; --i) {
150 DCHECK_LT(i, nread);
151 state_[kBufferedBytes]++;
152 if ((data[i] & 0xC0) == 0x80) {
153 // This byte does not start a character (a "trailing" byte).
154 if (state_[kBufferedBytes] >= 4 || i == 0) {
155 // We either have more then 4 trailing bytes (which means
156 // the current character would not be inside the range for
157 // valid Unicode, and in particular cannot be represented
158 // through JavaScript's UTF-16-based approach to strings), or the
159 // current buffer does not contain the start of an UTF-8 character
160 // at all. Either way, this is invalid UTF8 and we can just
161 // let the engine's decoder handle it.
162 state_[kBufferedBytes] = 0;
163 break;
164 }
165 } else {
166 // Found the first byte of a UTF-8 character. By looking at the
167 // upper bits we can tell how long the character *should* be.
168 if ((data[i] & 0xE0) == 0xC0) {
169 state_[kMissingBytes] = 2;
170 } else if ((data[i] & 0xF0) == 0xE0) {
171 state_[kMissingBytes] = 3;
172 } else if ((data[i] & 0xF8) == 0xF0) {
173 state_[kMissingBytes] = 4;
174 } else {
175 // This lead byte would indicate a character outside of the
176 // representable range.
177 state_[kBufferedBytes] = 0;
178 break;
179 }
180
181 if (BufferedBytes() >= MissingBytes()) {
182 // Received more or exactly as many trailing bytes than the lead
183 // character would indicate. In the "==" case, we have valid
184 // data and don't need to slice anything off;
185 // in the ">" case, this is invalid UTF-8 anyway.
186 state_[kMissingBytes] = 0;
187 state_[kBufferedBytes] = 0;
188 }
189
190 state_[kMissingBytes] -= state_[kBufferedBytes];
191 break;
192 }
193 }
194 } else if (Encoding() == UCS2) {
195 if ((nread % 2) == 1) {
196 // We got half a codepoint, and need the second byte of it.
197 state_[kBufferedBytes] = 1;
198 state_[kMissingBytes] = 1;
199 } else if ((data[nread - 1] & 0xFC) == 0xD8) {
200 // Half a split UTF-16 character.
201 state_[kBufferedBytes] = 2;
202 state_[kMissingBytes] = 2;
203 }
204 } else if (Encoding() == BASE64 || Encoding() == BASE64URL) {
205 state_[kBufferedBytes] = nread % 3;
206 if (state_[kBufferedBytes] > 0)
207 state_[kMissingBytes] = 3 - BufferedBytes();
208 }
209
210 if (BufferedBytes() > 0) {
211 // Copy the requested number of buffered bytes from the end of the
212 // input into the incomplete character buffer.
213 nread -= BufferedBytes();
214 *nread_ptr -= BufferedBytes();
215 memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
216 }
217
218 if (nread > 0) {
219 if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
220 return MaybeLocal<String>();
221 } else {
222 body = String::Empty(isolate);
223 }
224 }
225
226 if (prepend.IsEmpty()) {
227 return body;
228 } else {
229 return String::Concat(isolate, prepend, body);
230 }
231 } else {
232 CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
233 return MakeString(isolate, data, nread, Encoding());
234 }
235 }
236
FlushData(Isolate * isolate)237 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
238 if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
239 CHECK_EQ(MissingBytes(), 0);
240 CHECK_EQ(BufferedBytes(), 0);
241 }
242
243 if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
244 // Ignore a single trailing byte, like the JS decoder does.
245 state_[kMissingBytes]--;
246 state_[kBufferedBytes]--;
247 }
248
249 if (BufferedBytes() == 0)
250 return String::Empty(isolate);
251
252 MaybeLocal<String> ret =
253 MakeString(isolate,
254 IncompleteCharacterBuffer(),
255 BufferedBytes(),
256 Encoding());
257
258 state_[kMissingBytes] = 0;
259 state_[kBufferedBytes] = 0;
260
261 return ret;
262 }
263
264 namespace {
265
DecodeData(const FunctionCallbackInfo<Value> & args)266 void DecodeData(const FunctionCallbackInfo<Value>& args) {
267 StringDecoder* decoder =
268 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
269 CHECK_NOT_NULL(decoder);
270
271 CHECK(args[1]->IsArrayBufferView());
272 ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
273 size_t length = content.length();
274
275 MaybeLocal<String> ret =
276 decoder->DecodeData(args.GetIsolate(), content.data(), &length);
277 if (!ret.IsEmpty())
278 args.GetReturnValue().Set(ret.ToLocalChecked());
279 }
280
FlushData(const FunctionCallbackInfo<Value> & args)281 void FlushData(const FunctionCallbackInfo<Value>& args) {
282 StringDecoder* decoder =
283 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
284 CHECK_NOT_NULL(decoder);
285 MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
286 if (!ret.IsEmpty())
287 args.GetReturnValue().Set(ret.ToLocalChecked());
288 }
289
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)290 void InitializeStringDecoder(Local<Object> target,
291 Local<Value> unused,
292 Local<Context> context,
293 void* priv) {
294 Environment* env = Environment::GetCurrent(context);
295 Isolate* isolate = env->isolate();
296
297 #define SET_DECODER_CONSTANT(name) \
298 target->Set(context, \
299 FIXED_ONE_BYTE_STRING(isolate, #name), \
300 Integer::New(isolate, StringDecoder::name)).FromJust()
301
302 SET_DECODER_CONSTANT(kIncompleteCharactersStart);
303 SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
304 SET_DECODER_CONSTANT(kMissingBytes);
305 SET_DECODER_CONSTANT(kBufferedBytes);
306 SET_DECODER_CONSTANT(kEncodingField);
307 SET_DECODER_CONSTANT(kNumFields);
308
309 Local<Array> encodings = Array::New(isolate);
310 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \
311 encodings->Set(context, \
312 static_cast<int32_t>(cname), \
313 FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
314 ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
315 ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
316 ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
317 ADD_TO_ENCODINGS_ARRAY(BASE64URL, "base64url");
318 ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
319 ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
320 ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
321 ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
322
323 target->Set(context,
324 FIXED_ONE_BYTE_STRING(isolate, "encodings"),
325 encodings).Check();
326
327 target->Set(context,
328 FIXED_ONE_BYTE_STRING(isolate, "kSize"),
329 Integer::New(isolate, sizeof(StringDecoder))).Check();
330
331 SetMethod(context, target, "decode", DecodeData);
332 SetMethod(context, target, "flush", FlushData);
333 }
334
335 } // anonymous namespace
336
RegisterStringDecoderExternalReferences(ExternalReferenceRegistry * registry)337 void RegisterStringDecoderExternalReferences(
338 ExternalReferenceRegistry* registry) {
339 registry->Register(DecodeData);
340 registry->Register(FlushData);
341 }
342
343 } // namespace node
344
// Hook this file up under the binding name `string_decoder`: the first macro
// is given the initializer, the second the function that registers the
// binding's external references (both defined above).
NODE_BINDING_CONTEXT_AWARE_INTERNAL(string_decoder,
                                    node::InitializeStringDecoder)
NODE_BINDING_EXTERNAL_REFERENCE(string_decoder,
                                node::RegisterStringDecoderExternalReferences)
349