1 #include "string_decoder.h" // NOLINT(build/include_inline)
2 #include "string_decoder-inl.h"
3
4 #include "env-inl.h"
5 #include "node_buffer.h"
6 #include "string_bytes.h"
7 #include "util.h"
8
9 using v8::Array;
10 using v8::ArrayBufferView;
11 using v8::Context;
12 using v8::FunctionCallbackInfo;
13 using v8::Integer;
14 using v8::Isolate;
15 using v8::Local;
16 using v8::MaybeLocal;
17 using v8::Object;
18 using v8::String;
19 using v8::Value;
20
21 namespace node {
22
23 namespace {
24
MakeString(Isolate * isolate,const char * data,size_t length,enum encoding encoding)25 MaybeLocal<String> MakeString(Isolate* isolate,
26 const char* data,
27 size_t length,
28 enum encoding encoding) {
29 Local<Value> error;
30 MaybeLocal<Value> ret;
31 if (encoding == UTF8) {
32 return String::NewFromUtf8(
33 isolate,
34 data,
35 v8::NewStringType::kNormal,
36 length);
37 } else {
38 ret = StringBytes::Encode(
39 isolate,
40 data,
41 length,
42 encoding,
43 &error);
44 }
45
46 if (ret.IsEmpty()) {
47 CHECK(!error.IsEmpty());
48 isolate->ThrowException(error);
49 }
50
51 DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
52 return ret.FromMaybe(Local<Value>()).As<String>();
53 }
54
55 } // anonymous namespace
56
57
DecodeData(Isolate * isolate,const char * data,size_t * nread_ptr)58 MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
59 const char* data,
60 size_t* nread_ptr) {
61 Local<String> prepend, body;
62
63 size_t nread = *nread_ptr;
64
65 if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) {
66 // See if we want bytes to finish a character from the previous
67 // chunk; if so, copy the new bytes to the missing bytes buffer
68 // and create a small string from it that is to be prepended to the
69 // main body.
70 if (MissingBytes() > 0) {
71 // There are never more bytes missing than the pre-calculated maximum.
72 CHECK_LE(MissingBytes() + BufferedBytes(),
73 kIncompleteCharactersEnd);
74 if (Encoding() == UTF8) {
75 // For UTF-8, we need special treatment to align with the V8 decoder:
76 // If an incomplete character is found at a chunk boundary, we use
77 // its remainder and pass it to V8 as-is.
78 for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
79 if ((data[i] & 0xC0) != 0x80) {
80 // This byte is not a continuation byte even though it should have
81 // been one. We stop decoding of the incomplete character at this
82 // point (but still use the rest of the incomplete bytes from this
83 // chunk) and assume that the new, unexpected byte starts a new one.
84 state_[kMissingBytes] = 0;
85 memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
86 state_[kBufferedBytes] += i;
87 data += i;
88 nread -= i;
89 break;
90 }
91 }
92 }
93
94 size_t found_bytes =
95 std::min(nread, static_cast<size_t>(MissingBytes()));
96 memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
97 data,
98 found_bytes);
99 // Adjust the two buffers.
100 data += found_bytes;
101 nread -= found_bytes;
102
103 state_[kMissingBytes] -= found_bytes;
104 state_[kBufferedBytes] += found_bytes;
105
106 if (LIKELY(MissingBytes() == 0)) {
107 // If no more bytes are missing, create a small string that we
108 // will later prepend.
109 if (!MakeString(isolate,
110 IncompleteCharacterBuffer(),
111 BufferedBytes(),
112 Encoding()).ToLocal(&prepend)) {
113 return MaybeLocal<String>();
114 }
115
116 *nread_ptr += BufferedBytes();
117 // No more buffered bytes.
118 state_[kBufferedBytes] = 0;
119 }
120 }
121
122 // It could be that trying to finish the previous chunk already
123 // consumed all data that we received in this chunk.
124 if (UNLIKELY(nread == 0)) {
125 body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
126 prepend = Local<String>();
127 } else {
128 // If not, that means is no character left to finish at this point.
129 DCHECK_EQ(MissingBytes(), 0);
130 DCHECK_EQ(BufferedBytes(), 0);
131
132 // See whether there is a character that we may have to cut off and
133 // finish when receiving the next chunk.
134 if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
135 // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
136 // This means we'll need to figure out where the character to which
137 // the byte belongs begins.
138 for (size_t i = nread - 1; ; --i) {
139 DCHECK_LT(i, nread);
140 state_[kBufferedBytes]++;
141 if ((data[i] & 0xC0) == 0x80) {
142 // This byte does not start a character (a "trailing" byte).
143 if (state_[kBufferedBytes] >= 4 || i == 0) {
144 // We either have more then 4 trailing bytes (which means
145 // the current character would not be inside the range for
146 // valid Unicode, and in particular cannot be represented
147 // through JavaScript's UTF-16-based approach to strings), or the
148 // current buffer does not contain the start of an UTF-8 character
149 // at all. Either way, this is invalid UTF8 and we can just
150 // let the engine's decoder handle it.
151 state_[kBufferedBytes] = 0;
152 break;
153 }
154 } else {
155 // Found the first byte of a UTF-8 character. By looking at the
156 // upper bits we can tell how long the character *should* be.
157 if ((data[i] & 0xE0) == 0xC0) {
158 state_[kMissingBytes] = 2;
159 } else if ((data[i] & 0xF0) == 0xE0) {
160 state_[kMissingBytes] = 3;
161 } else if ((data[i] & 0xF8) == 0xF0) {
162 state_[kMissingBytes] = 4;
163 } else {
164 // This lead byte would indicate a character outside of the
165 // representable range.
166 state_[kBufferedBytes] = 0;
167 break;
168 }
169
170 if (BufferedBytes() >= MissingBytes()) {
171 // Received more or exactly as many trailing bytes than the lead
172 // character would indicate. In the "==" case, we have valid
173 // data and don't need to slice anything off;
174 // in the ">" case, this is invalid UTF-8 anyway.
175 state_[kMissingBytes] = 0;
176 state_[kBufferedBytes] = 0;
177 }
178
179 state_[kMissingBytes] -= state_[kBufferedBytes];
180 break;
181 }
182 }
183 } else if (Encoding() == UCS2) {
184 if ((nread % 2) == 1) {
185 // We got half a codepoint, and need the second byte of it.
186 state_[kBufferedBytes] = 1;
187 state_[kMissingBytes] = 1;
188 } else if ((data[nread - 1] & 0xFC) == 0xD8) {
189 // Half a split UTF-16 character.
190 state_[kBufferedBytes] = 2;
191 state_[kMissingBytes] = 2;
192 }
193 } else if (Encoding() == BASE64) {
194 state_[kBufferedBytes] = nread % 3;
195 if (state_[kBufferedBytes] > 0)
196 state_[kMissingBytes] = 3 - BufferedBytes();
197 }
198
199 if (BufferedBytes() > 0) {
200 // Copy the requested number of buffered bytes from the end of the
201 // input into the incomplete character buffer.
202 nread -= BufferedBytes();
203 *nread_ptr -= BufferedBytes();
204 memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
205 }
206
207 if (nread > 0) {
208 if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
209 return MaybeLocal<String>();
210 } else {
211 body = String::Empty(isolate);
212 }
213 }
214
215 if (prepend.IsEmpty()) {
216 return body;
217 } else {
218 return String::Concat(isolate, prepend, body);
219 }
220 } else {
221 CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
222 return MakeString(isolate, data, nread, Encoding());
223 }
224 }
225
FlushData(Isolate * isolate)226 MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
227 if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
228 CHECK_EQ(MissingBytes(), 0);
229 CHECK_EQ(BufferedBytes(), 0);
230 }
231
232 if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
233 // Ignore a single trailing byte, like the JS decoder does.
234 state_[kMissingBytes]--;
235 state_[kBufferedBytes]--;
236 }
237
238 if (BufferedBytes() == 0)
239 return String::Empty(isolate);
240
241 MaybeLocal<String> ret =
242 MakeString(isolate,
243 IncompleteCharacterBuffer(),
244 BufferedBytes(),
245 Encoding());
246
247 state_[kMissingBytes] = 0;
248 state_[kBufferedBytes] = 0;
249
250 return ret;
251 }
252
253 namespace {
254
DecodeData(const FunctionCallbackInfo<Value> & args)255 void DecodeData(const FunctionCallbackInfo<Value>& args) {
256 StringDecoder* decoder =
257 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
258 CHECK_NOT_NULL(decoder);
259
260 CHECK(args[1]->IsArrayBufferView());
261 ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
262 size_t length = content.length();
263
264 MaybeLocal<String> ret =
265 decoder->DecodeData(args.GetIsolate(), content.data(), &length);
266 if (!ret.IsEmpty())
267 args.GetReturnValue().Set(ret.ToLocalChecked());
268 }
269
FlushData(const FunctionCallbackInfo<Value> & args)270 void FlushData(const FunctionCallbackInfo<Value>& args) {
271 StringDecoder* decoder =
272 reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
273 CHECK_NOT_NULL(decoder);
274 MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
275 if (!ret.IsEmpty())
276 args.GetReturnValue().Set(ret.ToLocalChecked());
277 }
278
InitializeStringDecoder(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)279 void InitializeStringDecoder(Local<Object> target,
280 Local<Value> unused,
281 Local<Context> context,
282 void* priv) {
283 Environment* env = Environment::GetCurrent(context);
284 Isolate* isolate = env->isolate();
285
286 #define SET_DECODER_CONSTANT(name) \
287 target->Set(context, \
288 FIXED_ONE_BYTE_STRING(isolate, #name), \
289 Integer::New(isolate, StringDecoder::name)).FromJust()
290
291 SET_DECODER_CONSTANT(kIncompleteCharactersStart);
292 SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
293 SET_DECODER_CONSTANT(kMissingBytes);
294 SET_DECODER_CONSTANT(kBufferedBytes);
295 SET_DECODER_CONSTANT(kEncodingField);
296 SET_DECODER_CONSTANT(kNumFields);
297
298 Local<Array> encodings = Array::New(isolate);
299 #define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \
300 encodings->Set(context, \
301 static_cast<int32_t>(cname), \
302 FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
303 ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
304 ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
305 ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
306 ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
307 ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
308 ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
309 ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
310
311 target->Set(context,
312 FIXED_ONE_BYTE_STRING(isolate, "encodings"),
313 encodings).Check();
314
315 target->Set(context,
316 FIXED_ONE_BYTE_STRING(isolate, "kSize"),
317 Integer::New(isolate, sizeof(StringDecoder))).Check();
318
319 env->SetMethod(target, "decode", DecodeData);
320 env->SetMethod(target, "flush", FlushData);
321 }
322
323 } // anonymous namespace
324
325 } // namespace node
326
327 NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
328 node::InitializeStringDecoder)
329