1 #include "node_url.h"
2 #include "base_object-inl.h"
3 #include "node_errors.h"
4 #include "node_i18n.h"
5 #include "util-inl.h"
6
7 #include <cmath>
8 #include <cstdio>
9 #include <string>
10 #include <vector>
11
12 namespace node {
13
14 using errors::TryCatchScope;
15
16 using url::table_data::hex;
17 using url::table_data::C0_CONTROL_ENCODE_SET;
18 using url::table_data::FRAGMENT_ENCODE_SET;
19 using url::table_data::PATH_ENCODE_SET;
20 using url::table_data::USERINFO_ENCODE_SET;
21 using url::table_data::QUERY_ENCODE_SET_NONSPECIAL;
22 using url::table_data::QUERY_ENCODE_SET_SPECIAL;
23
24 using v8::Array;
25 using v8::Context;
26 using v8::Function;
27 using v8::FunctionCallbackInfo;
28 using v8::HandleScope;
29 using v8::Int32;
30 using v8::Integer;
31 using v8::Isolate;
32 using v8::Local;
33 using v8::MaybeLocal;
34 using v8::NewStringType;
35 using v8::Null;
36 using v8::Object;
37 using v8::String;
38 using v8::Undefined;
39 using v8::Value;
40
Utf8String(Isolate * isolate,const std::string & str)41 Local<String> Utf8String(Isolate* isolate, const std::string& str) {
42 return String::NewFromUtf8(isolate,
43 str.data(),
44 NewStringType::kNormal,
45 str.length()).ToLocalChecked();
46 }
47
48 namespace url {
49 namespace {
50
// Stand-in for the EOF code point
// (https://url.spec.whatwg.org/#eof-code-point): the parsing loops use this
// value as the "current character" once the end of the input is reached.
constexpr char kEOL = static_cast<char>(-1);

// U+FFFD REPLACEMENT CHARACTER. Used in ToUSVString().
constexpr char16_t kUnicodeReplacementCharacter = char16_t{0xFFFD};
56
// https://url.spec.whatwg.org/#concept-host
//
// Result of parsing a host. A freshly constructed object is in the "failed"
// state; a successful Parse*() call switches it to one of the other host
// types and fills in the matching member of `value_`.
class URLHost {
 public:
  ~URLHost();

  // Each parser expects the object to still be in the failed state (they
  // CHECK this) and leaves it there when the input is rejected.
  void ParseIPv4Host(const char* input, size_t length, bool* is_ipv4);
  void ParseIPv6Host(const char* input, size_t length);
  void ParseOpaqueHost(const char* input, size_t length);
  void ParseHost(const char* input,
                 size_t length,
                 bool is_special,
                 bool unicode = false);

  // True while no host has been parsed successfully.
  bool ParsingFailed() const { return type_ == HostType::H_FAILED; }
  // Serializes the host; yields an empty string in the failed state.
  std::string ToString() const;
  // Like ToString(), but avoids a copy in exchange for invalidating `*this`.
  std::string ToStringMove();

 private:
  enum class HostType { H_FAILED, H_DOMAIN, H_IPV4, H_IPV6, H_OPAQUE };

  // Storage for the parsed host. Which member is active is tracked by
  // `type_`; the string member is constructed and destroyed by hand, which
  // is why the union's own constructor/destructor never touch it.
  union Value {
    std::string domain_or_opaque;
    uint32_t ipv4;
    uint16_t ipv6[8];

    Value() : ipv4(0) {}
    ~Value() {}
  };

  Value value_;
  HostType type_ = HostType::H_FAILED;

  // Destroys the string member if it is the active one and returns to the
  // failed state.
  void Reset() {
    using StdString = std::string;
    if (type_ == HostType::H_DOMAIN || type_ == HostType::H_OPAQUE)
      value_.domain_or_opaque.~StdString();
    type_ = HostType::H_FAILED;
  }

  // Plain assignment to the string member of the union would be brittle: it
  // requires the member to already be a live std::string. For a long time
  // that happened to work because ParseIPv6Host() zero-filled `value_`, but
  // that leans on standard library internals. Instead the old value is
  // released with Reset() and the string is constructed in place.
  // (Not storing strings in a union at all may be the better long-term fix.)
  void SetString(HostType type, std::string&& string) {
    Reset();
    type_ = type;
    new (&value_.domain_or_opaque) std::string(std::move(string));
  }

  void SetOpaque(std::string&& string) {
    SetString(HostType::H_OPAQUE, std::move(string));
  }

  void SetDomain(std::string&& string) {
    SetString(HostType::H_DOMAIN, std::move(string));
  }
};

// Releases the string member, if it is the active one.
URLHost::~URLHost() {
  Reset();
}
133
// X-macro listing the enumerators of `url_cb_args` in declaration order.
// ARG_COUNT must remain the final entry so that its value equals the number
// of entries before it.
#define ARGS(V)                                                               \
  V(ARG_FLAGS)                                                                \
  V(ARG_PROTOCOL)                                                             \
  V(ARG_USERNAME)                                                             \
  V(ARG_PASSWORD)                                                             \
  V(ARG_HOST)                                                                 \
  V(ARG_PORT)                                                                 \
  V(ARG_PATH)                                                                 \
  V(ARG_QUERY)                                                                \
  V(ARG_FRAGMENT)                                                             \
  V(ARG_COUNT)  // This one has to be last.

// X-macro listing the enumerators of `url_error_cb_args`.
#define ERR_ARGS(V)                                                           \
  V(ERR_ARG_FLAGS)                                                            \
  V(ERR_ARG_INPUT)

// Turns one X-macro entry into one enumerator.
#define URL_ENUM_ENTRY(name) name,

enum url_cb_args {
  ARGS(URL_ENUM_ENTRY)
};

enum url_error_cb_args {
  ERR_ARGS(URL_ENUM_ENTRY)
};

#undef URL_ENUM_ENTRY
161
// Character classification helpers.
//
// Every predicate is a function template over the character type. A
// static_assert rejects character types that are narrower than the predicate
// requires (8 bits for the ASCII tests, 16 bits for the UTF-16 tests).

// https://infra.spec.whatwg.org/#ascii-tab-or-newline
template <typename T>
bool IsASCIITabOrNewline(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return ch == '\t' || ch == '\n' || ch == '\r';
}

// https://infra.spec.whatwg.org/#c0-control-or-space
template <typename T>
bool IsC0ControlOrSpace(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return ch >= '\0' && ch <= ' ';
}

// https://infra.spec.whatwg.org/#ascii-digit
template <typename T>
bool IsASCIIDigit(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return ch >= '0' && ch <= '9';
}

// https://infra.spec.whatwg.org/#ascii-hex-digit
template <typename T>
bool IsASCIIHexDigit(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return IsASCIIDigit(ch) ||
         (ch >= 'A' && ch <= 'F') ||
         (ch >= 'a' && ch <= 'f');
}

// https://infra.spec.whatwg.org/#ascii-alpha
template <typename T>
bool IsASCIIAlpha(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
}

// https://infra.spec.whatwg.org/#ascii-alphanumeric
template <typename T>
bool IsASCIIAlphanumeric(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return IsASCIIDigit(ch) || IsASCIIAlpha(ch);
}

// https://infra.spec.whatwg.org/#ascii-lowercase
// ASCII letters are mapped to lower case by setting bit 0x20; every other
// character is returned unchanged.
template <typename T>
T ASCIILowercase(T ch) {
  if (!IsASCIIAlpha(ch))
    return ch;
  return ch | 0x20;
}

// https://url.spec.whatwg.org/#forbidden-host-code-point
template <typename T>
bool IsForbiddenHostCodePoint(const T ch) {
  static_assert(sizeof(ch) >= 1, "Character must be wider than 8 bits");
  return ch == '\0' || ch == '\t' || ch == '\n' || ch == '\r' ||
         ch == ' ' || ch == '#' || ch == '%' || ch == '/' ||
         ch == ':' || ch == '?' || ch == '@' || ch == '[' ||
         ch == '\\' || ch == ']';
}

// https://url.spec.whatwg.org/#windows-drive-letter
template <typename T>
bool IsWindowsDriveLetter(const T ch1, const T ch2) {
  static_assert(sizeof(ch1) >= 1, "Character must be wider than 8 bits");
  return IsASCIIAlpha(ch1) && (ch2 == ':' || ch2 == '|');
}

// String overload: tests the first two characters; shorter strings fail.
template <typename T>
bool IsWindowsDriveLetter(const std::basic_string<T>& str) {
  static_assert(sizeof(str[0]) >= 1, "Character must be wider than 8 bits");
  return str.length() >= 2 && IsWindowsDriveLetter(str[0], str[1]);
}

// https://url.spec.whatwg.org/#normalized-windows-drive-letter
template <typename T>
bool IsNormalizedWindowsDriveLetter(const T ch1, const T ch2) {
  static_assert(sizeof(ch1) >= 1, "Character must be wider than 8 bits");
  return IsASCIIAlpha(ch1) && ch2 == ':';
}

// String overload: tests the first two characters; shorter strings fail.
template <typename T>
bool IsNormalizedWindowsDriveLetter(const std::basic_string<T>& str) {
  static_assert(sizeof(str[0]) >= 1, "Character must be wider than 8 bits");
  return str.length() >= 2 && IsNormalizedWindowsDriveLetter(str[0], str[1]);
}

// If a UTF-16 character is a low/trailing surrogate.
template <typename T>
bool IsUnicodeTrail(const T ch) {
  static_assert(sizeof(ch) >= 2, "Character must be wider than 16 bits");
  return (ch & 0xFC00) == 0xDC00;
}

// If a UTF-16 character is a surrogate.
template <typename T>
bool IsUnicodeSurrogate(const T ch) {
  static_assert(sizeof(ch) >= 2, "Character must be wider than 16 bits");
  return (ch & 0xF800) == 0xD800;
}

// If a UTF-16 surrogate is a low/trailing one. Only meaningful for a
// character that is already known to be a surrogate.
template <typename T>
bool IsUnicodeSurrogateTrail(const T ch) {
  static_assert(sizeof(ch) >= 2, "Character must be wider than 16 bits");
  return (ch & 0x400) != 0;
}
237
238
// Tests bit `i` of the bitmap `a`. Bit i lives in byte i / 8, with the least
// significant bit of each byte holding the lowest index.
bool BitAt(const uint8_t a[], const uint8_t i) {
  const uint8_t byte = a[i / 8];
  const unsigned mask = 1u << (i % 8);
  return (byte & mask) != 0;
}
242
243 // Appends ch to str. If ch position in encode_set is set, the ch will
244 // be percent-encoded then appended.
AppendOrEscape(std::string * str,const unsigned char ch,const uint8_t encode_set[])245 void AppendOrEscape(std::string* str,
246 const unsigned char ch,
247 const uint8_t encode_set[]) {
248 if (BitAt(encode_set, ch))
249 *str += hex + ch * 4; // "%XX\0" has a length of 4
250 else
251 *str += ch;
252 }
253
// Converts one ASCII hex digit (either case) to its numeric value 0..15.
// Any other character yields static_cast<unsigned>(-1).
template <typename T>
unsigned hex2bin(const T ch) {
  const bool is_decimal = ch >= '0' && ch <= '9';
  const bool is_upper = ch >= 'A' && ch <= 'F';
  const bool is_lower = ch >= 'a' && ch <= 'f';

  if (is_decimal) {
    return ch - '0';
  } else if (is_upper) {
    return 10 + (ch - 'A');
  } else if (is_lower) {
    return 10 + (ch - 'a');
  } else {
    return static_cast<unsigned>(-1);
  }
}
264
PercentDecode(const char * input,size_t len)265 std::string PercentDecode(const char* input, size_t len) {
266 std::string dest;
267 if (len == 0)
268 return dest;
269 dest.reserve(len);
270 const char* pointer = input;
271 const char* end = input + len;
272
273 while (pointer < end) {
274 const char ch = pointer[0];
275 size_t remaining = end - pointer - 1;
276 if (ch != '%' || remaining < 2 ||
277 (ch == '%' &&
278 (!IsASCIIHexDigit(pointer[1]) ||
279 !IsASCIIHexDigit(pointer[2])))) {
280 dest += ch;
281 pointer++;
282 continue;
283 } else {
284 unsigned a = hex2bin(pointer[1]);
285 unsigned b = hex2bin(pointer[2]);
286 char c = static_cast<char>(a * 16 + b);
287 dest += c;
288 pointer += 3;
289 }
290 }
291 return dest;
292 }
293
// X-macro table of the special schemes. Each entry is
//   ENTRY(identifier, default port, scheme string including the ':')
// where a default port of -1 is used for "file:".
#define SPECIALS(ENTRY)                                                       \
  ENTRY(ftp, 21, "ftp:")                                                      \
  ENTRY(file, -1, "file:")                                                    \
  ENTRY(gopher, 70, "gopher:")                                                \
  ENTRY(http, 80, "http:")                                                    \
  ENTRY(https, 443, "https:")                                                 \
  ENTRY(ws, 80, "ws:")                                                        \
  ENTRY(wss, 443, "wss:")
302
IsSpecial(const std::string & scheme)303 bool IsSpecial(const std::string& scheme) {
304 #define V(_, __, name) if (scheme == name) return true;
305 SPECIALS(V);
306 #undef V
307 return false;
308 }
309
GetSpecial(Environment * env,const std::string & scheme)310 Local<String> GetSpecial(Environment* env, const std::string& scheme) {
311 #define V(key, _, name) if (scheme == name) \
312 return env->url_special_##key##_string();
313 SPECIALS(V)
314 #undef V
315 UNREACHABLE();
316 }
317
NormalizePort(const std::string & scheme,int p)318 int NormalizePort(const std::string& scheme, int p) {
319 #define V(_, port, name) if (scheme == name && p == port) return -1;
320 SPECIALS(V);
321 #undef V
322 return p;
323 }
324
325 // https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
StartsWithWindowsDriveLetter(const char * p,const char * end)326 bool StartsWithWindowsDriveLetter(const char* p, const char* end) {
327 size_t length = end - p;
328 return length >= 2 &&
329 IsWindowsDriveLetter(p[0], p[1]) &&
330 (length == 2 ||
331 p[2] == '/' ||
332 p[2] == '\\' ||
333 p[2] == '?' ||
334 p[2] == '#');
335 }
336
// Converts `input` with i18n::ToUnicode() and stores the result in `*output`.
// Returns false, leaving `*output` untouched, when the conversion reports an
// error. Without ICU support this is an intentional no-op that copies the
// input and always succeeds.
bool ToUnicode(const std::string& input, std::string* output) {
#if defined(NODE_HAVE_I18N_SUPPORT)
  MaybeStackBuffer<char> buf;
  const bool converted =
      i18n::ToUnicode(&buf, input.c_str(), input.length()) >= 0;
  if (converted)
    output->assign(*buf, buf.length());
  return converted;
#else
  *output = input;
  return true;
#endif
}

// Converts `input` with i18n::ToASCII() and stores the result in `*output`.
// Returns false, leaving `*output` untouched, when the conversion reports an
// error. Without ICU support this is an intentional no-op that copies the
// input and always succeeds.
bool ToASCII(const std::string& input, std::string* output) {
#if defined(NODE_HAVE_I18N_SUPPORT)
  MaybeStackBuffer<char> buf;
  const bool converted =
      i18n::ToASCII(&buf, input.c_str(), input.length()) >= 0;
  if (converted)
    output->assign(*buf, buf.length());
  return converted;
#else
  *output = input;
  return true;
#endif
}
365
366 #define NS_IN6ADDRSZ 16
367
ParseIPv6Host(const char * input,size_t length)368 void URLHost::ParseIPv6Host(const char* input, size_t length) {
369 CHECK_EQ(type_, HostType::H_FAILED);
370
371 unsigned char buf[sizeof(struct in6_addr)];
372 MaybeStackBuffer<char> ipv6(length + 1);
373 *(*ipv6 + length) = 0;
374 memset(buf, 0, sizeof(buf));
375 memcpy(*ipv6, input, sizeof(const char) * length);
376
377 int ret = uv_inet_pton(AF_INET6, *ipv6, buf);
378
379 if (ret != 0) {
380 return;
381 }
382
383 // Ref: https://sourceware.org/git/?p=glibc.git;a=blob;f=resolv/inet_ntop.c;h=c4d38c0f951013e51a4fc6eaa8a9b82e146abe5a;hb=HEAD#l119
384 for (int i = 0; i < NS_IN6ADDRSZ; i += 2) {
385 value_.ipv6[i >> 1] = (buf[i] << 8) | buf[i + 1];
386 }
387
388 type_ = HostType::H_IPV6;
389 }
390
// Parses the characters in [start, end) as one part of an IPv4 address.
//
// A "0x"/"0X" prefix selects hexadecimal, a leading '0' followed by at least
// one more character selects octal, anything else is decimal.
//
// Returns:
//   * 0 if nothing is left after an optional "0x" prefix (or the range is
//     empty),
//   * -1 if any character is not a digit of the selected radix,
//   * the parsed value otherwise, saturated at the largest int64_t when the
//     number does not fit.
//
// Only characters inside [start, end) are read; the range does not need to
// be followed by a terminator.
int64_t ParseNumber(const char* start, const char* end) {
  int64_t radix = 10;
  if (end - start >= 2 && start[0] == '0' && (start[1] | 0x20) == 'x') {
    start += 2;
    radix = 16;
  }
  if (end - start == 0) {
    return 0;
  } else if (radix == 10 && end - start > 1 && start[0] == '0') {
    start++;
    radix = 8;
  }

  // Largest value representable in int64_t.
  constexpr int64_t kMax =
      static_cast<int64_t>(~static_cast<uint64_t>(0) >> 1);

  int64_t value = 0;
  bool saturated = false;
  for (const char* p = start; p < end; p++) {
    const char ch = *p;
    int64_t digit;
    if (ch >= '0' && ch <= '9') {
      digit = ch - '0';
    } else if (ch >= 'a' && ch <= 'f') {
      digit = 10 + (ch - 'a');
    } else if (ch >= 'A' && ch <= 'F') {
      digit = 10 + (ch - 'A');
    } else {
      return -1;
    }
    if (digit >= radix)
      return -1;

    // Keep validating the remaining characters after an overflow so that an
    // invalid character anywhere in the range still yields -1.
    if (saturated)
      continue;
    if (value > (kMax - digit) / radix) {
      value = kMax;
      saturated = true;
    } else {
      value = value * radix + digit;
    }
  }
  return value;
}
425
ParseIPv4Host(const char * input,size_t length,bool * is_ipv4)426 void URLHost::ParseIPv4Host(const char* input, size_t length, bool* is_ipv4) {
427 CHECK_EQ(type_, HostType::H_FAILED);
428 *is_ipv4 = false;
429 const char* pointer = input;
430 const char* mark = input;
431 const char* end = pointer + length;
432 int parts = 0;
433 uint32_t val = 0;
434 uint64_t numbers[4];
435 int tooBigNumbers = 0;
436 if (length == 0)
437 return;
438
439 while (pointer <= end) {
440 const char ch = pointer < end ? pointer[0] : kEOL;
441 int64_t remaining = end - pointer - 1;
442 if (ch == '.' || ch == kEOL) {
443 if (++parts > static_cast<int>(arraysize(numbers)))
444 return;
445 if (pointer == mark)
446 return;
447 int64_t n = ParseNumber(mark, pointer);
448 if (n < 0)
449 return;
450
451 if (n > 255) {
452 tooBigNumbers++;
453 }
454 numbers[parts - 1] = n;
455 mark = pointer + 1;
456 if (ch == '.' && remaining == 0)
457 break;
458 }
459 pointer++;
460 }
461 CHECK_GT(parts, 0);
462 *is_ipv4 = true;
463
464 // If any but the last item in numbers is greater than 255, return failure.
465 // If the last item in numbers is greater than or equal to
466 // 256^(5 - the number of items in numbers), return failure.
467 if (tooBigNumbers > 1 ||
468 (tooBigNumbers == 1 && numbers[parts - 1] <= 255) ||
469 numbers[parts - 1] >= pow(256, static_cast<double>(5 - parts))) {
470 return;
471 }
472
473 type_ = HostType::H_IPV4;
474 val = static_cast<uint32_t>(numbers[parts - 1]);
475 for (int n = 0; n < parts - 1; n++) {
476 double b = 3 - n;
477 val +=
478 static_cast<uint32_t>(numbers[n]) * static_cast<uint32_t>(pow(256, b));
479 }
480
481 value_.ipv4 = val;
482 }
483
ParseOpaqueHost(const char * input,size_t length)484 void URLHost::ParseOpaqueHost(const char* input, size_t length) {
485 CHECK_EQ(type_, HostType::H_FAILED);
486 std::string output;
487 output.reserve(length);
488 for (size_t i = 0; i < length; i++) {
489 const char ch = input[i];
490 if (ch != '%' && IsForbiddenHostCodePoint(ch)) {
491 return;
492 } else {
493 AppendOrEscape(&output, ch, C0_CONTROL_ENCODE_SET);
494 }
495 }
496
497 SetOpaque(std::move(output));
498 }
499
ParseHost(const char * input,size_t length,bool is_special,bool unicode)500 void URLHost::ParseHost(const char* input,
501 size_t length,
502 bool is_special,
503 bool unicode) {
504 CHECK_EQ(type_, HostType::H_FAILED);
505 const char* pointer = input;
506
507 if (length == 0)
508 return;
509
510 if (pointer[0] == '[') {
511 if (pointer[length - 1] != ']')
512 return;
513 return ParseIPv6Host(++pointer, length - 2);
514 }
515
516 if (!is_special)
517 return ParseOpaqueHost(input, length);
518
519 // First, we have to percent decode
520 std::string decoded = PercentDecode(input, length);
521
522 // Then we have to punycode toASCII
523 if (!ToASCII(decoded, &decoded))
524 return;
525
526 // If any of the following characters are still present, we have to fail
527 for (size_t n = 0; n < decoded.size(); n++) {
528 const char ch = decoded[n];
529 if (IsForbiddenHostCodePoint(ch)) {
530 return;
531 }
532 }
533
534 // Check to see if it's an IPv4 IP address
535 bool is_ipv4;
536 ParseIPv4Host(decoded.c_str(), decoded.length(), &is_ipv4);
537 if (is_ipv4)
538 return;
539
540 // If the unicode flag is set, run the result through punycode ToUnicode
541 if (unicode && !ToUnicode(decoded, &decoded))
542 return;
543
544 // It's not an IPv4 or IPv6 address, it must be a domain
545 SetDomain(std::move(decoded));
546 }
547
// Locates the longest run of zero elements in values[0..len) so that the
// IPv6 serializer can replace it with "::". Only runs of at least two zeros
// qualify, and of several equally long runs the first one wins. Returns a
// pointer to the first element of that run, or nullptr if there is none.
template <typename T>
T* FindLongestZeroSequence(T* values, size_t len) {
  T* best_start = nullptr;
  unsigned best_length = 1;  // A run has to be longer than this to count.

  T* run_start = nullptr;
  unsigned run_length = 0;

  for (size_t i = 0; i < len; i++) {
    if (values[i] != 0) {
      // The current run (if any) just ended; keep it if it is a new best.
      if (run_length > best_length) {
        best_length = run_length;
        best_start = run_start;
      }
      run_length = 0;
      run_start = nullptr;
      continue;
    }
    if (run_start == nullptr)
      run_start = &values[i];
    run_length++;
  }

  // A run that reaches the end of the array has not been compared yet.
  return run_length > best_length ? run_start : best_start;
}
578
ToStringMove()579 std::string URLHost::ToStringMove() {
580 std::string return_value;
581 switch (type_) {
582 case HostType::H_DOMAIN:
583 case HostType::H_OPAQUE:
584 return_value = std::move(value_.domain_or_opaque);
585 break;
586 default:
587 return_value = ToString();
588 break;
589 }
590 Reset();
591 return return_value;
592 }
593
ToString() const594 std::string URLHost::ToString() const {
595 std::string dest;
596 switch (type_) {
597 case HostType::H_DOMAIN:
598 case HostType::H_OPAQUE:
599 return value_.domain_or_opaque;
600 break;
601 case HostType::H_IPV4: {
602 dest.reserve(15);
603 uint32_t value = value_.ipv4;
604 for (int n = 0; n < 4; n++) {
605 char buf[4];
606 snprintf(buf, sizeof(buf), "%d", value % 256);
607 dest.insert(0, buf);
608 if (n < 3)
609 dest.insert(0, 1, '.');
610 value /= 256;
611 }
612 break;
613 }
614 case HostType::H_IPV6: {
615 dest.reserve(41);
616 dest += '[';
617 const uint16_t* start = &value_.ipv6[0];
618 const uint16_t* compress_pointer =
619 FindLongestZeroSequence(start, 8);
620 bool ignore0 = false;
621 for (int n = 0; n <= 7; n++) {
622 const uint16_t* piece = &value_.ipv6[n];
623 if (ignore0 && *piece == 0)
624 continue;
625 else if (ignore0)
626 ignore0 = false;
627 if (compress_pointer == piece) {
628 dest += n == 0 ? "::" : ":";
629 ignore0 = true;
630 continue;
631 }
632 char buf[5];
633 snprintf(buf, sizeof(buf), "%x", *piece);
634 dest += buf;
635 if (n < 7)
636 dest += ':';
637 }
638 dest += ']';
639 break;
640 }
641 case HostType::H_FAILED:
642 break;
643 }
644 return dest;
645 }
646
ParseHost(const std::string & input,std::string * output,bool is_special,bool unicode=false)647 bool ParseHost(const std::string& input,
648 std::string* output,
649 bool is_special,
650 bool unicode = false) {
651 if (input.empty()) {
652 output->clear();
653 return true;
654 }
655 URLHost host;
656 host.ParseHost(input.c_str(), input.length(), is_special, unicode);
657 if (host.ParsingFailed())
658 return false;
659 *output = host.ToStringMove();
660 return true;
661 }
662
FromJSStringArray(Environment * env,Local<Array> array)663 std::vector<std::string> FromJSStringArray(Environment* env,
664 Local<Array> array) {
665 std::vector<std::string> vec;
666 if (array->Length() > 0)
667 vec.reserve(array->Length());
668 for (size_t n = 0; n < array->Length(); n++) {
669 Local<Value> val = array->Get(env->context(), n).ToLocalChecked();
670 if (val->IsString()) {
671 Utf8Value value(env->isolate(), val.As<String>());
672 vec.emplace_back(*value, value.length());
673 }
674 }
675 return vec;
676 }
677
HarvestBase(Environment * env,Local<Object> base_obj)678 url_data HarvestBase(Environment* env, Local<Object> base_obj) {
679 url_data base;
680 Local<Context> context = env->context();
681
682 Local<Value> flags =
683 base_obj->Get(env->context(), env->flags_string()).ToLocalChecked();
684 if (flags->IsInt32())
685 base.flags = flags->Int32Value(context).FromJust();
686
687 Local<Value> port =
688 base_obj->Get(env->context(), env->port_string()).ToLocalChecked();
689 if (port->IsInt32())
690 base.port = port->Int32Value(context).FromJust();
691
692 Local<Value> scheme =
693 base_obj->Get(env->context(), env->scheme_string()).ToLocalChecked();
694 base.scheme = Utf8Value(env->isolate(), scheme).out();
695
696 auto GetStr = [&](std::string url_data::*member,
697 int flag,
698 Local<String> name,
699 bool empty_as_present) {
700 Local<Value> value = base_obj->Get(env->context(), name).ToLocalChecked();
701 if (value->IsString()) {
702 Utf8Value utf8value(env->isolate(), value.As<String>());
703 (base.*member).assign(*utf8value, utf8value.length());
704 if (empty_as_present || value.As<String>()->Length() != 0) {
705 base.flags |= flag;
706 }
707 }
708 };
709 GetStr(&url_data::username,
710 URL_FLAGS_HAS_USERNAME,
711 env->username_string(),
712 false);
713 GetStr(&url_data::password,
714 URL_FLAGS_HAS_PASSWORD,
715 env->password_string(),
716 false);
717 GetStr(&url_data::host, URL_FLAGS_HAS_HOST, env->host_string(), true);
718 GetStr(&url_data::query, URL_FLAGS_HAS_QUERY, env->query_string(), true);
719 GetStr(&url_data::fragment,
720 URL_FLAGS_HAS_FRAGMENT,
721 env->fragment_string(),
722 true);
723
724 Local<Value>
725 path = base_obj->Get(env->context(), env->path_string()).ToLocalChecked();
726 if (path->IsArray()) {
727 base.flags |= URL_FLAGS_HAS_PATH;
728 base.path = FromJSStringArray(env, path.As<Array>());
729 }
730 return base;
731 }
732
HarvestContext(Environment * env,Local<Object> context_obj)733 url_data HarvestContext(Environment* env, Local<Object> context_obj) {
734 url_data context;
735 Local<Value> flags =
736 context_obj->Get(env->context(), env->flags_string()).ToLocalChecked();
737 if (flags->IsInt32()) {
738 static constexpr int32_t kCopyFlagsMask =
739 URL_FLAGS_SPECIAL |
740 URL_FLAGS_CANNOT_BE_BASE |
741 URL_FLAGS_HAS_USERNAME |
742 URL_FLAGS_HAS_PASSWORD |
743 URL_FLAGS_HAS_HOST;
744 context.flags |= flags.As<Int32>()->Value() & kCopyFlagsMask;
745 }
746 Local<Value> scheme =
747 context_obj->Get(env->context(), env->scheme_string()).ToLocalChecked();
748 if (scheme->IsString()) {
749 Utf8Value value(env->isolate(), scheme);
750 context.scheme.assign(*value, value.length());
751 }
752 Local<Value> port =
753 context_obj->Get(env->context(), env->port_string()).ToLocalChecked();
754 if (port->IsInt32())
755 context.port = port.As<Int32>()->Value();
756 if (context.flags & URL_FLAGS_HAS_USERNAME) {
757 Local<Value> username =
758 context_obj->Get(env->context(),
759 env->username_string()).ToLocalChecked();
760 CHECK(username->IsString());
761 Utf8Value value(env->isolate(), username);
762 context.username.assign(*value, value.length());
763 }
764 if (context.flags & URL_FLAGS_HAS_PASSWORD) {
765 Local<Value> password =
766 context_obj->Get(env->context(),
767 env->password_string()).ToLocalChecked();
768 CHECK(password->IsString());
769 Utf8Value value(env->isolate(), password);
770 context.password.assign(*value, value.length());
771 }
772 Local<Value> host =
773 context_obj->Get(env->context(),
774 env->host_string()).ToLocalChecked();
775 if (host->IsString()) {
776 Utf8Value value(env->isolate(), host);
777 context.host.assign(*value, value.length());
778 }
779 return context;
780 }
781
782 // Single dot segment can be ".", "%2e", or "%2E"
IsSingleDotSegment(const std::string & str)783 bool IsSingleDotSegment(const std::string& str) {
784 switch (str.size()) {
785 case 1:
786 return str == ".";
787 case 3:
788 return str[0] == '%' &&
789 str[1] == '2' &&
790 ASCIILowercase(str[2]) == 'e';
791 default:
792 return false;
793 }
794 }
795
796 // Double dot segment can be:
797 // "..", ".%2e", ".%2E", "%2e.", "%2E.",
798 // "%2e%2e", "%2E%2E", "%2e%2E", or "%2E%2e"
IsDoubleDotSegment(const std::string & str)799 bool IsDoubleDotSegment(const std::string& str) {
800 switch (str.size()) {
801 case 2:
802 return str == "..";
803 case 4:
804 if (str[0] != '.' && str[0] != '%')
805 return false;
806 return ((str[0] == '.' &&
807 str[1] == '%' &&
808 str[2] == '2' &&
809 ASCIILowercase(str[3]) == 'e') ||
810 (str[0] == '%' &&
811 str[1] == '2' &&
812 ASCIILowercase(str[2]) == 'e' &&
813 str[3] == '.'));
814 case 6:
815 return (str[0] == '%' &&
816 str[1] == '2' &&
817 ASCIILowercase(str[2]) == 'e' &&
818 str[3] == '%' &&
819 str[4] == '2' &&
820 ASCIILowercase(str[5]) == 'e');
821 default:
822 return false;
823 }
824 }
825
ShortenUrlPath(struct url_data * url)826 void ShortenUrlPath(struct url_data* url) {
827 if (url->path.empty()) return;
828 if (url->path.size() == 1 && url->scheme == "file:" &&
829 IsNormalizedWindowsDriveLetter(url->path[0])) return;
830 url->path.pop_back();
831 }
832
833 } // anonymous namespace
834
Parse(const char * input,size_t len,enum url_parse_state state_override,struct url_data * url,bool has_url,const struct url_data * base,bool has_base)835 void URL::Parse(const char* input,
836 size_t len,
837 enum url_parse_state state_override,
838 struct url_data* url,
839 bool has_url,
840 const struct url_data* base,
841 bool has_base) {
842 const char* p = input;
843 const char* end = input + len;
844
845 if (!has_url) {
846 for (const char* ptr = p; ptr < end; ptr++) {
847 if (IsC0ControlOrSpace(*ptr))
848 p++;
849 else
850 break;
851 }
852 for (const char* ptr = end - 1; ptr >= p; ptr--) {
853 if (IsC0ControlOrSpace(*ptr))
854 end--;
855 else
856 break;
857 }
858 input = p;
859 len = end - p;
860 }
861
862 // The spec says we should strip out any ASCII tabs or newlines.
863 // In those cases, we create another std::string instance with the filtered
864 // contents, but in the general case we avoid the overhead.
865 std::string whitespace_stripped;
866 for (const char* ptr = p; ptr < end; ptr++) {
867 if (!IsASCIITabOrNewline(*ptr))
868 continue;
869 // Hit tab or newline. Allocate storage, copy what we have until now,
870 // and then iterate and filter all similar characters out.
871 whitespace_stripped.reserve(len - 1);
872 whitespace_stripped.assign(p, ptr - p);
873 // 'ptr + 1' skips the current char, which we know to be tab or newline.
874 for (ptr = ptr + 1; ptr < end; ptr++) {
875 if (!IsASCIITabOrNewline(*ptr))
876 whitespace_stripped += *ptr;
877 }
878
879 // Update variables like they should have looked like if the string
880 // had been stripped of whitespace to begin with.
881 input = whitespace_stripped.c_str();
882 len = whitespace_stripped.size();
883 p = input;
884 end = input + len;
885 break;
886 }
887
888 bool atflag = false; // Set when @ has been seen.
889 bool square_bracket_flag = false; // Set inside of [...]
890 bool password_token_seen_flag = false; // Set after a : after an username.
891
892 std::string buffer;
893
894 // Set the initial parse state.
895 const bool has_state_override = state_override != kUnknownState;
896 enum url_parse_state state = has_state_override ? state_override :
897 kSchemeStart;
898
899 if (state < kSchemeStart || state > kFragment) {
900 url->flags |= URL_FLAGS_INVALID_PARSE_STATE;
901 return;
902 }
903
904 while (p <= end) {
905 const char ch = p < end ? p[0] : kEOL;
906 bool special = (url->flags & URL_FLAGS_SPECIAL);
907 bool cannot_be_base;
908 bool special_back_slash = (special && ch == '\\');
909
910 switch (state) {
911 case kSchemeStart:
912 if (IsASCIIAlpha(ch)) {
913 buffer += ASCIILowercase(ch);
914 state = kScheme;
915 } else if (!has_state_override) {
916 state = kNoScheme;
917 continue;
918 } else {
919 url->flags |= URL_FLAGS_FAILED;
920 return;
921 }
922 break;
923 case kScheme:
924 if (IsASCIIAlphanumeric(ch) || ch == '+' || ch == '-' || ch == '.') {
925 buffer += ASCIILowercase(ch);
926 } else if (ch == ':' || (has_state_override && ch == kEOL)) {
927 if (has_state_override && buffer.size() == 0) {
928 url->flags |= URL_FLAGS_TERMINATED;
929 return;
930 }
931 buffer += ':';
932
933 bool new_is_special = IsSpecial(buffer);
934
935 if (has_state_override) {
936 if ((special != new_is_special) ||
937 ((buffer == "file:") &&
938 ((url->flags & URL_FLAGS_HAS_USERNAME) ||
939 (url->flags & URL_FLAGS_HAS_PASSWORD) ||
940 (url->port != -1)))) {
941 url->flags |= URL_FLAGS_TERMINATED;
942 return;
943 }
944
945 // File scheme && (host == empty or null) check left to JS-land
946 // as it can be done before even entering C++ binding.
947 }
948
949 url->scheme = std::move(buffer);
950 url->port = NormalizePort(url->scheme, url->port);
951 if (new_is_special) {
952 url->flags |= URL_FLAGS_SPECIAL;
953 special = true;
954 } else {
955 url->flags &= ~URL_FLAGS_SPECIAL;
956 special = false;
957 }
958 special_back_slash = (special && ch == '\\');
959 buffer.clear();
960 if (has_state_override)
961 return;
962 if (url->scheme == "file:") {
963 state = kFile;
964 } else if (special &&
965 has_base &&
966 url->scheme == base->scheme) {
967 state = kSpecialRelativeOrAuthority;
968 } else if (special) {
969 state = kSpecialAuthoritySlashes;
970 } else if (p + 1 < end && p[1] == '/') {
971 state = kPathOrAuthority;
972 p++;
973 } else {
974 url->flags |= URL_FLAGS_CANNOT_BE_BASE;
975 url->flags |= URL_FLAGS_HAS_PATH;
976 url->path.emplace_back("");
977 state = kCannotBeBase;
978 }
979 } else if (!has_state_override) {
980 buffer.clear();
981 state = kNoScheme;
982 p = input;
983 continue;
984 } else {
985 url->flags |= URL_FLAGS_FAILED;
986 return;
987 }
988 break;
989 case kNoScheme:
990 cannot_be_base = has_base && (base->flags & URL_FLAGS_CANNOT_BE_BASE);
991 if (!has_base || (cannot_be_base && ch != '#')) {
992 url->flags |= URL_FLAGS_FAILED;
993 return;
994 } else if (cannot_be_base && ch == '#') {
995 url->scheme = base->scheme;
996 if (IsSpecial(url->scheme)) {
997 url->flags |= URL_FLAGS_SPECIAL;
998 special = true;
999 } else {
1000 url->flags &= ~URL_FLAGS_SPECIAL;
1001 special = false;
1002 }
1003 special_back_slash = (special && ch == '\\');
1004 if (base->flags & URL_FLAGS_HAS_PATH) {
1005 url->flags |= URL_FLAGS_HAS_PATH;
1006 url->path = base->path;
1007 }
1008 if (base->flags & URL_FLAGS_HAS_QUERY) {
1009 url->flags |= URL_FLAGS_HAS_QUERY;
1010 url->query = base->query;
1011 }
1012 if (base->flags & URL_FLAGS_HAS_FRAGMENT) {
1013 url->flags |= URL_FLAGS_HAS_FRAGMENT;
1014 url->fragment = base->fragment;
1015 }
1016 url->flags |= URL_FLAGS_CANNOT_BE_BASE;
1017 state = kFragment;
1018 } else if (has_base &&
1019 base->scheme != "file:") {
1020 state = kRelative;
1021 continue;
1022 } else {
1023 url->scheme = "file:";
1024 url->flags |= URL_FLAGS_SPECIAL;
1025 special = true;
1026 state = kFile;
1027 special_back_slash = (special && ch == '\\');
1028 continue;
1029 }
1030 break;
1031 case kSpecialRelativeOrAuthority:
1032 if (ch == '/' && p + 1 < end && p[1] == '/') {
1033 state = kSpecialAuthorityIgnoreSlashes;
1034 p++;
1035 } else {
1036 state = kRelative;
1037 continue;
1038 }
1039 break;
1040 case kPathOrAuthority:
1041 if (ch == '/') {
1042 state = kAuthority;
1043 } else {
1044 state = kPath;
1045 continue;
1046 }
1047 break;
1048 case kRelative:
1049 url->scheme = base->scheme;
1050 if (IsSpecial(url->scheme)) {
1051 url->flags |= URL_FLAGS_SPECIAL;
1052 special = true;
1053 } else {
1054 url->flags &= ~URL_FLAGS_SPECIAL;
1055 special = false;
1056 }
1057 special_back_slash = (special && ch == '\\');
1058 switch (ch) {
1059 case kEOL:
1060 if (base->flags & URL_FLAGS_HAS_USERNAME) {
1061 url->flags |= URL_FLAGS_HAS_USERNAME;
1062 url->username = base->username;
1063 }
1064 if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1065 url->flags |= URL_FLAGS_HAS_PASSWORD;
1066 url->password = base->password;
1067 }
1068 if (base->flags & URL_FLAGS_HAS_HOST) {
1069 url->flags |= URL_FLAGS_HAS_HOST;
1070 url->host = base->host;
1071 }
1072 if (base->flags & URL_FLAGS_HAS_QUERY) {
1073 url->flags |= URL_FLAGS_HAS_QUERY;
1074 url->query = base->query;
1075 }
1076 if (base->flags & URL_FLAGS_HAS_PATH) {
1077 url->flags |= URL_FLAGS_HAS_PATH;
1078 url->path = base->path;
1079 }
1080 url->port = base->port;
1081 break;
1082 case '/':
1083 state = kRelativeSlash;
1084 break;
1085 case '?':
1086 if (base->flags & URL_FLAGS_HAS_USERNAME) {
1087 url->flags |= URL_FLAGS_HAS_USERNAME;
1088 url->username = base->username;
1089 }
1090 if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1091 url->flags |= URL_FLAGS_HAS_PASSWORD;
1092 url->password = base->password;
1093 }
1094 if (base->flags & URL_FLAGS_HAS_HOST) {
1095 url->flags |= URL_FLAGS_HAS_HOST;
1096 url->host = base->host;
1097 }
1098 if (base->flags & URL_FLAGS_HAS_PATH) {
1099 url->flags |= URL_FLAGS_HAS_PATH;
1100 url->path = base->path;
1101 }
1102 url->port = base->port;
1103 state = kQuery;
1104 break;
1105 case '#':
1106 if (base->flags & URL_FLAGS_HAS_USERNAME) {
1107 url->flags |= URL_FLAGS_HAS_USERNAME;
1108 url->username = base->username;
1109 }
1110 if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1111 url->flags |= URL_FLAGS_HAS_PASSWORD;
1112 url->password = base->password;
1113 }
1114 if (base->flags & URL_FLAGS_HAS_HOST) {
1115 url->flags |= URL_FLAGS_HAS_HOST;
1116 url->host = base->host;
1117 }
1118 if (base->flags & URL_FLAGS_HAS_QUERY) {
1119 url->flags |= URL_FLAGS_HAS_QUERY;
1120 url->query = base->query;
1121 }
1122 if (base->flags & URL_FLAGS_HAS_PATH) {
1123 url->flags |= URL_FLAGS_HAS_PATH;
1124 url->path = base->path;
1125 }
1126 url->port = base->port;
1127 state = kFragment;
1128 break;
1129 default:
1130 if (special_back_slash) {
1131 state = kRelativeSlash;
1132 } else {
1133 if (base->flags & URL_FLAGS_HAS_USERNAME) {
1134 url->flags |= URL_FLAGS_HAS_USERNAME;
1135 url->username = base->username;
1136 }
1137 if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1138 url->flags |= URL_FLAGS_HAS_PASSWORD;
1139 url->password = base->password;
1140 }
1141 if (base->flags & URL_FLAGS_HAS_HOST) {
1142 url->flags |= URL_FLAGS_HAS_HOST;
1143 url->host = base->host;
1144 }
1145 if (base->flags & URL_FLAGS_HAS_PATH) {
1146 url->flags |= URL_FLAGS_HAS_PATH;
1147 url->path = base->path;
1148 ShortenUrlPath(url);
1149 }
1150 url->port = base->port;
1151 state = kPath;
1152 continue;
1153 }
1154 }
1155 break;
1156 case kRelativeSlash:
1157 if (IsSpecial(url->scheme) && (ch == '/' || ch == '\\')) {
1158 state = kSpecialAuthorityIgnoreSlashes;
1159 } else if (ch == '/') {
1160 state = kAuthority;
1161 } else {
1162 if (base->flags & URL_FLAGS_HAS_USERNAME) {
1163 url->flags |= URL_FLAGS_HAS_USERNAME;
1164 url->username = base->username;
1165 }
1166 if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1167 url->flags |= URL_FLAGS_HAS_PASSWORD;
1168 url->password = base->password;
1169 }
1170 if (base->flags & URL_FLAGS_HAS_HOST) {
1171 url->flags |= URL_FLAGS_HAS_HOST;
1172 url->host = base->host;
1173 }
1174 url->port = base->port;
1175 state = kPath;
1176 continue;
1177 }
1178 break;
1179 case kSpecialAuthoritySlashes:
1180 state = kSpecialAuthorityIgnoreSlashes;
1181 if (ch == '/' && p + 1 < end && p[1] == '/') {
1182 p++;
1183 } else {
1184 continue;
1185 }
1186 break;
1187 case kSpecialAuthorityIgnoreSlashes:
1188 if (ch != '/' && ch != '\\') {
1189 state = kAuthority;
1190 continue;
1191 }
1192 break;
1193 case kAuthority:
1194 if (ch == '@') {
1195 if (atflag) {
1196 buffer.reserve(buffer.size() + 3);
1197 buffer.insert(0, "%40");
1198 }
1199 atflag = true;
1200 size_t blen = buffer.size();
1201 if (blen > 0 && buffer[0] != ':') {
1202 url->flags |= URL_FLAGS_HAS_USERNAME;
1203 }
1204 for (size_t n = 0; n < blen; n++) {
1205 const char bch = buffer[n];
1206 if (bch == ':') {
1207 url->flags |= URL_FLAGS_HAS_PASSWORD;
1208 if (!password_token_seen_flag) {
1209 password_token_seen_flag = true;
1210 continue;
1211 }
1212 }
1213 if (password_token_seen_flag) {
1214 AppendOrEscape(&url->password, bch, USERINFO_ENCODE_SET);
1215 } else {
1216 AppendOrEscape(&url->username, bch, USERINFO_ENCODE_SET);
1217 }
1218 }
1219 buffer.clear();
1220 } else if (ch == kEOL ||
1221 ch == '/' ||
1222 ch == '?' ||
1223 ch == '#' ||
1224 special_back_slash) {
1225 if (atflag && buffer.size() == 0) {
1226 url->flags |= URL_FLAGS_FAILED;
1227 return;
1228 }
1229 p -= buffer.size() + 1;
1230 buffer.clear();
1231 state = kHost;
1232 } else {
1233 buffer += ch;
1234 }
1235 break;
1236 case kHost:
1237 case kHostname:
1238 if (has_state_override && url->scheme == "file:") {
1239 state = kFileHost;
1240 continue;
1241 } else if (ch == ':' && !square_bracket_flag) {
1242 if (buffer.size() == 0) {
1243 url->flags |= URL_FLAGS_FAILED;
1244 return;
1245 }
1246 url->flags |= URL_FLAGS_HAS_HOST;
1247 if (!ParseHost(buffer, &url->host, special)) {
1248 url->flags |= URL_FLAGS_FAILED;
1249 return;
1250 }
1251 buffer.clear();
1252 state = kPort;
1253 if (state_override == kHostname) {
1254 return;
1255 }
1256 } else if (ch == kEOL ||
1257 ch == '/' ||
1258 ch == '?' ||
1259 ch == '#' ||
1260 special_back_slash) {
1261 p--;
1262 if (special && buffer.size() == 0) {
1263 url->flags |= URL_FLAGS_FAILED;
1264 return;
1265 }
1266 if (has_state_override &&
1267 buffer.size() == 0 &&
1268 ((url->username.size() > 0 || url->password.size() > 0) ||
1269 url->port != -1)) {
1270 url->flags |= URL_FLAGS_TERMINATED;
1271 return;
1272 }
1273 url->flags |= URL_FLAGS_HAS_HOST;
1274 if (!ParseHost(buffer, &url->host, special)) {
1275 url->flags |= URL_FLAGS_FAILED;
1276 return;
1277 }
1278 buffer.clear();
1279 state = kPathStart;
1280 if (has_state_override) {
1281 return;
1282 }
1283 } else {
1284 if (ch == '[')
1285 square_bracket_flag = true;
1286 if (ch == ']')
1287 square_bracket_flag = false;
1288 buffer += ch;
1289 }
1290 break;
1291 case kPort:
1292 if (IsASCIIDigit(ch)) {
1293 buffer += ch;
1294 } else if (has_state_override ||
1295 ch == kEOL ||
1296 ch == '/' ||
1297 ch == '?' ||
1298 ch == '#' ||
1299 special_back_slash) {
1300 if (buffer.size() > 0) {
1301 unsigned port = 0;
1302 // the condition port <= 0xffff prevents integer overflow
1303 for (size_t i = 0; port <= 0xffff && i < buffer.size(); i++)
1304 port = port * 10 + buffer[i] - '0';
1305 if (port > 0xffff) {
1306 // TODO(TimothyGu): This hack is currently needed for the host
1307 // setter since it needs access to hostname if it is valid, and
1308 // if the FAILED flag is set the entire response to JS layer
1309 // will be empty.
1310 if (state_override == kHost)
1311 url->port = -1;
1312 else
1313 url->flags |= URL_FLAGS_FAILED;
1314 return;
1315 }
1316 // the port is valid
1317 url->port = NormalizePort(url->scheme, static_cast<int>(port));
1318 if (url->port == -1)
1319 url->flags |= URL_FLAGS_IS_DEFAULT_SCHEME_PORT;
1320 buffer.clear();
1321 } else if (has_state_override) {
1322 // TODO(TimothyGu): Similar case as above.
1323 if (state_override == kHost)
1324 url->port = -1;
1325 else
1326 url->flags |= URL_FLAGS_TERMINATED;
1327 return;
1328 }
1329 state = kPathStart;
1330 continue;
1331 } else {
1332 url->flags |= URL_FLAGS_FAILED;
1333 return;
1334 }
1335 break;
1336 case kFile:
1337 url->scheme = "file:";
1338 if (ch == '/' || ch == '\\') {
1339 state = kFileSlash;
1340 } else if (has_base && base->scheme == "file:") {
1341 switch (ch) {
1342 case kEOL:
1343 if (base->flags & URL_FLAGS_HAS_HOST) {
1344 url->flags |= URL_FLAGS_HAS_HOST;
1345 url->host = base->host;
1346 }
1347 if (base->flags & URL_FLAGS_HAS_PATH) {
1348 url->flags |= URL_FLAGS_HAS_PATH;
1349 url->path = base->path;
1350 }
1351 if (base->flags & URL_FLAGS_HAS_QUERY) {
1352 url->flags |= URL_FLAGS_HAS_QUERY;
1353 url->query = base->query;
1354 }
1355 break;
1356 case '?':
1357 if (base->flags & URL_FLAGS_HAS_HOST) {
1358 url->flags |= URL_FLAGS_HAS_HOST;
1359 url->host = base->host;
1360 }
1361 if (base->flags & URL_FLAGS_HAS_PATH) {
1362 url->flags |= URL_FLAGS_HAS_PATH;
1363 url->path = base->path;
1364 }
1365 url->flags |= URL_FLAGS_HAS_QUERY;
1366 url->query.clear();
1367 state = kQuery;
1368 break;
1369 case '#':
1370 if (base->flags & URL_FLAGS_HAS_HOST) {
1371 url->flags |= URL_FLAGS_HAS_HOST;
1372 url->host = base->host;
1373 }
1374 if (base->flags & URL_FLAGS_HAS_PATH) {
1375 url->flags |= URL_FLAGS_HAS_PATH;
1376 url->path = base->path;
1377 }
1378 if (base->flags & URL_FLAGS_HAS_QUERY) {
1379 url->flags |= URL_FLAGS_HAS_QUERY;
1380 url->query = base->query;
1381 }
1382 url->flags |= URL_FLAGS_HAS_FRAGMENT;
1383 url->fragment.clear();
1384 state = kFragment;
1385 break;
1386 default:
1387 if (!StartsWithWindowsDriveLetter(p, end)) {
1388 if (base->flags & URL_FLAGS_HAS_HOST) {
1389 url->flags |= URL_FLAGS_HAS_HOST;
1390 url->host = base->host;
1391 }
1392 if (base->flags & URL_FLAGS_HAS_PATH) {
1393 url->flags |= URL_FLAGS_HAS_PATH;
1394 url->path = base->path;
1395 }
1396 ShortenUrlPath(url);
1397 }
1398 state = kPath;
1399 continue;
1400 }
1401 } else {
1402 state = kPath;
1403 continue;
1404 }
1405 break;
1406 case kFileSlash:
1407 if (ch == '/' || ch == '\\') {
1408 state = kFileHost;
1409 } else {
1410 if (has_base &&
1411 base->scheme == "file:" &&
1412 !StartsWithWindowsDriveLetter(p, end)) {
1413 if (IsNormalizedWindowsDriveLetter(base->path[0])) {
1414 url->flags |= URL_FLAGS_HAS_PATH;
1415 url->path.push_back(base->path[0]);
1416 } else {
1417 if (base->flags & URL_FLAGS_HAS_HOST) {
1418 url->flags |= URL_FLAGS_HAS_HOST;
1419 url->host = base->host;
1420 } else {
1421 url->flags &= ~URL_FLAGS_HAS_HOST;
1422 url->host.clear();
1423 }
1424 }
1425 }
1426 state = kPath;
1427 continue;
1428 }
1429 break;
1430 case kFileHost:
1431 if (ch == kEOL ||
1432 ch == '/' ||
1433 ch == '\\' ||
1434 ch == '?' ||
1435 ch == '#') {
1436 if (!has_state_override &&
1437 buffer.size() == 2 &&
1438 IsWindowsDriveLetter(buffer)) {
1439 state = kPath;
1440 } else if (buffer.size() == 0) {
1441 url->flags |= URL_FLAGS_HAS_HOST;
1442 url->host.clear();
1443 if (has_state_override)
1444 return;
1445 state = kPathStart;
1446 } else {
1447 std::string host;
1448 if (!ParseHost(buffer, &host, special)) {
1449 url->flags |= URL_FLAGS_FAILED;
1450 return;
1451 }
1452 if (host == "localhost")
1453 host.clear();
1454 url->flags |= URL_FLAGS_HAS_HOST;
1455 url->host = host;
1456 if (has_state_override)
1457 return;
1458 buffer.clear();
1459 state = kPathStart;
1460 }
1461 continue;
1462 } else {
1463 buffer += ch;
1464 }
1465 break;
1466 case kPathStart:
1467 if (IsSpecial(url->scheme)) {
1468 state = kPath;
1469 if (ch != '/' && ch != '\\') {
1470 continue;
1471 }
1472 } else if (!has_state_override && ch == '?') {
1473 url->flags |= URL_FLAGS_HAS_QUERY;
1474 url->query.clear();
1475 state = kQuery;
1476 } else if (!has_state_override && ch == '#') {
1477 url->flags |= URL_FLAGS_HAS_FRAGMENT;
1478 url->fragment.clear();
1479 state = kFragment;
1480 } else if (ch != kEOL) {
1481 state = kPath;
1482 if (ch != '/') {
1483 continue;
1484 }
1485 }
1486 break;
1487 case kPath:
1488 if (ch == kEOL ||
1489 ch == '/' ||
1490 special_back_slash ||
1491 (!has_state_override && (ch == '?' || ch == '#'))) {
1492 if (IsDoubleDotSegment(buffer)) {
1493 ShortenUrlPath(url);
1494 if (ch != '/' && !special_back_slash) {
1495 url->flags |= URL_FLAGS_HAS_PATH;
1496 url->path.emplace_back("");
1497 }
1498 } else if (IsSingleDotSegment(buffer) &&
1499 ch != '/' && !special_back_slash) {
1500 url->flags |= URL_FLAGS_HAS_PATH;
1501 url->path.emplace_back("");
1502 } else if (!IsSingleDotSegment(buffer)) {
1503 if (url->scheme == "file:" &&
1504 url->path.empty() &&
1505 buffer.size() == 2 &&
1506 IsWindowsDriveLetter(buffer)) {
1507 if ((url->flags & URL_FLAGS_HAS_HOST) &&
1508 !url->host.empty()) {
1509 url->host.clear();
1510 url->flags |= URL_FLAGS_HAS_HOST;
1511 }
1512 buffer[1] = ':';
1513 }
1514 url->flags |= URL_FLAGS_HAS_PATH;
1515 url->path.emplace_back(std::move(buffer));
1516 }
1517 buffer.clear();
1518 if (url->scheme == "file:" &&
1519 (ch == kEOL ||
1520 ch == '?' ||
1521 ch == '#')) {
1522 while (url->path.size() > 1 && url->path[0].empty()) {
1523 url->path.erase(url->path.begin());
1524 }
1525 }
1526 if (ch == '?') {
1527 url->flags |= URL_FLAGS_HAS_QUERY;
1528 state = kQuery;
1529 } else if (ch == '#') {
1530 state = kFragment;
1531 }
1532 } else {
1533 AppendOrEscape(&buffer, ch, PATH_ENCODE_SET);
1534 }
1535 break;
1536 case kCannotBeBase:
1537 switch (ch) {
1538 case '?':
1539 state = kQuery;
1540 break;
1541 case '#':
1542 state = kFragment;
1543 break;
1544 default:
1545 if (url->path.empty())
1546 url->path.emplace_back("");
1547 else if (ch != kEOL)
1548 AppendOrEscape(&url->path[0], ch, C0_CONTROL_ENCODE_SET);
1549 }
1550 break;
1551 case kQuery:
1552 if (ch == kEOL || (!has_state_override && ch == '#')) {
1553 url->flags |= URL_FLAGS_HAS_QUERY;
1554 url->query = std::move(buffer);
1555 buffer.clear();
1556 if (ch == '#')
1557 state = kFragment;
1558 } else {
1559 AppendOrEscape(&buffer, ch, special ? QUERY_ENCODE_SET_SPECIAL :
1560 QUERY_ENCODE_SET_NONSPECIAL);
1561 }
1562 break;
1563 case kFragment:
1564 switch (ch) {
1565 case kEOL:
1566 url->flags |= URL_FLAGS_HAS_FRAGMENT;
1567 url->fragment = std::move(buffer);
1568 break;
1569 case 0:
1570 break;
1571 default:
1572 AppendOrEscape(&buffer, ch, FRAGMENT_ENCODE_SET);
1573 }
1574 break;
1575 default:
1576 url->flags |= URL_FLAGS_INVALID_PARSE_STATE;
1577 return;
1578 }
1579
1580 p++;
1581 }
1582 } // NOLINT(readability/fn_size)
1583
1584 namespace {
SetArgs(Environment * env,Local<Value> argv[ARG_COUNT],const struct url_data & url)1585 void SetArgs(Environment* env,
1586 Local<Value> argv[ARG_COUNT],
1587 const struct url_data& url) {
1588 Isolate* isolate = env->isolate();
1589 argv[ARG_FLAGS] = Integer::NewFromUnsigned(isolate, url.flags);
1590 argv[ARG_PROTOCOL] =
1591 url.flags & URL_FLAGS_SPECIAL ?
1592 GetSpecial(env, url.scheme) :
1593 OneByteString(isolate, url.scheme.c_str());
1594 if (url.flags & URL_FLAGS_HAS_USERNAME)
1595 argv[ARG_USERNAME] = Utf8String(isolate, url.username);
1596 if (url.flags & URL_FLAGS_HAS_PASSWORD)
1597 argv[ARG_PASSWORD] = Utf8String(isolate, url.password);
1598 if (url.flags & URL_FLAGS_HAS_HOST)
1599 argv[ARG_HOST] = Utf8String(isolate, url.host);
1600 if (url.flags & URL_FLAGS_HAS_QUERY)
1601 argv[ARG_QUERY] = Utf8String(isolate, url.query);
1602 if (url.flags & URL_FLAGS_HAS_FRAGMENT)
1603 argv[ARG_FRAGMENT] = Utf8String(isolate, url.fragment);
1604 if (url.port > -1)
1605 argv[ARG_PORT] = Integer::New(isolate, url.port);
1606 if (url.flags & URL_FLAGS_HAS_PATH)
1607 argv[ARG_PATH] = ToV8Value(env->context(), url.path).ToLocalChecked();
1608 }
1609
Parse(Environment * env,Local<Value> recv,const char * input,size_t len,enum url_parse_state state_override,Local<Value> base_obj,Local<Value> context_obj,Local<Function> cb,Local<Value> error_cb)1610 void Parse(Environment* env,
1611 Local<Value> recv,
1612 const char* input,
1613 size_t len,
1614 enum url_parse_state state_override,
1615 Local<Value> base_obj,
1616 Local<Value> context_obj,
1617 Local<Function> cb,
1618 Local<Value> error_cb) {
1619 Isolate* isolate = env->isolate();
1620 Local<Context> context = env->context();
1621 HandleScope handle_scope(isolate);
1622 Context::Scope context_scope(context);
1623
1624 const bool has_context = context_obj->IsObject();
1625 const bool has_base = base_obj->IsObject();
1626
1627 url_data base;
1628 url_data url;
1629 if (has_context)
1630 url = HarvestContext(env, context_obj.As<Object>());
1631 if (has_base)
1632 base = HarvestBase(env, base_obj.As<Object>());
1633
1634 URL::Parse(input, len, state_override, &url, has_context, &base, has_base);
1635 if ((url.flags & URL_FLAGS_INVALID_PARSE_STATE) ||
1636 ((state_override != kUnknownState) &&
1637 (url.flags & URL_FLAGS_TERMINATED)))
1638 return;
1639
1640 // Define the return value placeholders
1641 const Local<Value> undef = Undefined(isolate);
1642 const Local<Value> null = Null(isolate);
1643 if (!(url.flags & URL_FLAGS_FAILED)) {
1644 Local<Value> argv[] = {
1645 undef,
1646 undef,
1647 undef,
1648 undef,
1649 null, // host defaults to null
1650 null, // port defaults to null
1651 undef,
1652 null, // query defaults to null
1653 null, // fragment defaults to null
1654 };
1655 SetArgs(env, argv, url);
1656 cb->Call(context, recv, arraysize(argv), argv).FromMaybe(Local<Value>());
1657 } else if (error_cb->IsFunction()) {
1658 Local<Value> argv[2] = { undef, undef };
1659 argv[ERR_ARG_FLAGS] = Integer::NewFromUnsigned(isolate, url.flags);
1660 argv[ERR_ARG_INPUT] =
1661 String::NewFromUtf8(env->isolate(), input).ToLocalChecked();
1662 error_cb.As<Function>()->Call(context, recv, arraysize(argv), argv)
1663 .FromMaybe(Local<Value>());
1664 }
1665 }
1666
Parse(const FunctionCallbackInfo<Value> & args)1667 void Parse(const FunctionCallbackInfo<Value>& args) {
1668 Environment* env = Environment::GetCurrent(args);
1669 CHECK_GE(args.Length(), 5);
1670 CHECK(args[0]->IsString()); // input
1671 CHECK(args[2]->IsUndefined() || // base context
1672 args[2]->IsNull() ||
1673 args[2]->IsObject());
1674 CHECK(args[3]->IsUndefined() || // context
1675 args[3]->IsNull() ||
1676 args[3]->IsObject());
1677 CHECK(args[4]->IsFunction()); // complete callback
1678 CHECK(args[5]->IsUndefined() || args[5]->IsFunction()); // error callback
1679
1680 Utf8Value input(env->isolate(), args[0]);
1681 enum url_parse_state state_override = kUnknownState;
1682 if (args[1]->IsNumber()) {
1683 state_override = static_cast<enum url_parse_state>(
1684 args[1]->Uint32Value(env->context()).FromJust());
1685 }
1686
1687 Parse(env, args.This(),
1688 *input, input.length(),
1689 state_override,
1690 args[2],
1691 args[3],
1692 args[4].As<Function>(),
1693 args[5]);
1694 }
1695
EncodeAuthSet(const FunctionCallbackInfo<Value> & args)1696 void EncodeAuthSet(const FunctionCallbackInfo<Value>& args) {
1697 Environment* env = Environment::GetCurrent(args);
1698 CHECK_GE(args.Length(), 1);
1699 CHECK(args[0]->IsString());
1700 Utf8Value value(env->isolate(), args[0]);
1701 std::string output;
1702 size_t len = value.length();
1703 output.reserve(len);
1704 for (size_t n = 0; n < len; n++) {
1705 const char ch = (*value)[n];
1706 AppendOrEscape(&output, ch, USERINFO_ENCODE_SET);
1707 }
1708 args.GetReturnValue().Set(
1709 String::NewFromUtf8(env->isolate(), output.c_str()).ToLocalChecked());
1710 }
1711
ToUSVString(const FunctionCallbackInfo<Value> & args)1712 void ToUSVString(const FunctionCallbackInfo<Value>& args) {
1713 Environment* env = Environment::GetCurrent(args);
1714 CHECK_GE(args.Length(), 2);
1715 CHECK(args[0]->IsString());
1716 CHECK(args[1]->IsNumber());
1717
1718 TwoByteValue value(env->isolate(), args[0]);
1719
1720 int64_t start = args[1]->IntegerValue(env->context()).FromJust();
1721 CHECK_GE(start, 0);
1722
1723 for (size_t i = start; i < value.length(); i++) {
1724 char16_t c = value[i];
1725 if (!IsUnicodeSurrogate(c)) {
1726 continue;
1727 } else if (IsUnicodeSurrogateTrail(c) || i == value.length() - 1) {
1728 value[i] = kUnicodeReplacementCharacter;
1729 } else {
1730 char16_t d = value[i + 1];
1731 if (IsUnicodeTrail(d)) {
1732 i++;
1733 } else {
1734 value[i] = kUnicodeReplacementCharacter;
1735 }
1736 }
1737 }
1738
1739 args.GetReturnValue().Set(
1740 String::NewFromTwoByte(env->isolate(),
1741 *value,
1742 NewStringType::kNormal,
1743 value.length()).ToLocalChecked());
1744 }
1745
DomainToASCII(const FunctionCallbackInfo<Value> & args)1746 void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
1747 Environment* env = Environment::GetCurrent(args);
1748 CHECK_GE(args.Length(), 1);
1749 CHECK(args[0]->IsString());
1750 Utf8Value value(env->isolate(), args[0]);
1751
1752 URLHost host;
1753 // Assuming the host is used for a special scheme.
1754 host.ParseHost(*value, value.length(), true);
1755 if (host.ParsingFailed()) {
1756 args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), ""));
1757 return;
1758 }
1759 std::string out = host.ToStringMove();
1760 args.GetReturnValue().Set(
1761 String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked());
1762 }
1763
DomainToUnicode(const FunctionCallbackInfo<Value> & args)1764 void DomainToUnicode(const FunctionCallbackInfo<Value>& args) {
1765 Environment* env = Environment::GetCurrent(args);
1766 CHECK_GE(args.Length(), 1);
1767 CHECK(args[0]->IsString());
1768 Utf8Value value(env->isolate(), args[0]);
1769
1770 URLHost host;
1771 // Assuming the host is used for a special scheme.
1772 host.ParseHost(*value, value.length(), true, true);
1773 if (host.ParsingFailed()) {
1774 args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), ""));
1775 return;
1776 }
1777 std::string out = host.ToStringMove();
1778 args.GetReturnValue().Set(
1779 String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked());
1780 }
1781
SetURLConstructor(const FunctionCallbackInfo<Value> & args)1782 void SetURLConstructor(const FunctionCallbackInfo<Value>& args) {
1783 Environment* env = Environment::GetCurrent(args);
1784 CHECK_EQ(args.Length(), 1);
1785 CHECK(args[0]->IsFunction());
1786 env->set_url_constructor_function(args[0].As<Function>());
1787 }
1788
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)1789 void Initialize(Local<Object> target,
1790 Local<Value> unused,
1791 Local<Context> context,
1792 void* priv) {
1793 Environment* env = Environment::GetCurrent(context);
1794 env->SetMethod(target, "parse", Parse);
1795 env->SetMethodNoSideEffect(target, "encodeAuth", EncodeAuthSet);
1796 env->SetMethodNoSideEffect(target, "toUSVString", ToUSVString);
1797 env->SetMethodNoSideEffect(target, "domainToASCII", DomainToASCII);
1798 env->SetMethodNoSideEffect(target, "domainToUnicode", DomainToUnicode);
1799 env->SetMethod(target, "setURLConstructor", SetURLConstructor);
1800
1801 #define XX(name, _) NODE_DEFINE_CONSTANT(target, name);
1802 FLAGS(XX)
1803 #undef XX
1804
1805 #define XX(name) NODE_DEFINE_CONSTANT(target, name);
1806 PARSESTATES(XX)
1807 #undef XX
1808 }
1809 } // namespace
1810
ToFilePath() const1811 std::string URL::ToFilePath() const {
1812 if (context_.scheme != "file:") {
1813 return "";
1814 }
1815
1816 #ifdef _WIN32
1817 const char* slash = "\\";
1818 auto is_slash = [] (char ch) {
1819 return ch == '/' || ch == '\\';
1820 };
1821 #else
1822 const char* slash = "/";
1823 auto is_slash = [] (char ch) {
1824 return ch == '/';
1825 };
1826 if ((context_.flags & URL_FLAGS_HAS_HOST) &&
1827 context_.host.length() > 0) {
1828 return "";
1829 }
1830 #endif
1831 std::string decoded_path;
1832 for (const std::string& part : context_.path) {
1833 std::string decoded = PercentDecode(part.c_str(), part.length());
1834 for (char& ch : decoded) {
1835 if (is_slash(ch)) {
1836 return "";
1837 }
1838 }
1839 decoded_path += slash + decoded;
1840 }
1841
1842 #ifdef _WIN32
1843 // TODO(TimothyGu): Use "\\?\" long paths on Windows.
1844
1845 // If hostname is set, then we have a UNC path. Pass the hostname through
1846 // ToUnicode just in case it is an IDN using punycode encoding. We do not
1847 // need to worry about percent encoding because the URL parser will have
1848 // already taken care of that for us. Note that this only causes IDNs with an
1849 // appropriate `xn--` prefix to be decoded.
1850 if ((context_.flags & URL_FLAGS_HAS_HOST) &&
1851 context_.host.length() > 0) {
1852 std::string unicode_host;
1853 if (!ToUnicode(context_.host, &unicode_host)) {
1854 return "";
1855 }
1856 return "\\\\" + unicode_host + decoded_path;
1857 }
1858 // Otherwise, it's a local path that requires a drive letter.
1859 if (decoded_path.length() < 3) {
1860 return "";
1861 }
1862 if (decoded_path[2] != ':' ||
1863 !IsASCIIAlpha(decoded_path[1])) {
1864 return "";
1865 }
1866 // Strip out the leading '\'.
1867 return decoded_path.substr(1);
1868 #else
1869 return decoded_path;
1870 #endif
1871 }
1872
FromFilePath(const std::string & file_path)1873 URL URL::FromFilePath(const std::string& file_path) {
1874 URL url("file://");
1875 std::string escaped_file_path;
1876 for (size_t i = 0; i < file_path.length(); ++i) {
1877 escaped_file_path += file_path[i];
1878 if (file_path[i] == '%')
1879 escaped_file_path += "25";
1880 }
1881 URL::Parse(escaped_file_path.c_str(), escaped_file_path.length(), kPathStart,
1882 &url.context_, true, nullptr, false);
1883 return url;
1884 }
1885
1886 // This function works by calling out to a JS function that creates and
1887 // returns the JS URL object. Be mindful of the JS<->Native boundary
1888 // crossing that is required.
ToObject(Environment * env) const1889 MaybeLocal<Value> URL::ToObject(Environment* env) const {
1890 Isolate* isolate = env->isolate();
1891 Local<Context> context = env->context();
1892 Context::Scope context_scope(context);
1893
1894 const Local<Value> undef = Undefined(isolate);
1895 const Local<Value> null = Null(isolate);
1896
1897 if (context_.flags & URL_FLAGS_FAILED)
1898 return Local<Value>();
1899
1900 Local<Value> argv[] = {
1901 undef,
1902 undef,
1903 undef,
1904 undef,
1905 null, // host defaults to null
1906 null, // port defaults to null
1907 undef,
1908 null, // query defaults to null
1909 null, // fragment defaults to null
1910 };
1911 SetArgs(env, argv, context_);
1912
1913 MaybeLocal<Value> ret;
1914 {
1915 TryCatchScope try_catch(env, TryCatchScope::CatchMode::kFatal);
1916
1917 // The SetURLConstructor method must have been called already to
1918 // set the constructor function used below. SetURLConstructor is
1919 // called automatically when the internal/url.js module is loaded
1920 // during the internal/bootstrap/node.js processing.
1921 ret = env->url_constructor_function()
1922 ->Call(env->context(), undef, arraysize(argv), argv);
1923 }
1924
1925 return ret;
1926 }
1927
1928 } // namespace url
1929 } // namespace node
1930
1931 NODE_MODULE_CONTEXT_AWARE_INTERNAL(url, node::url::Initialize)
1932