• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "node_url.h"
2 #include "base_object-inl.h"
3 #include "node_errors.h"
4 #include "node_i18n.h"
5 #include "util-inl.h"
6 
7 #include <cmath>
8 #include <cstdio>
9 #include <string>
10 #include <vector>
11 
12 namespace node {
13 
14 using errors::TryCatchScope;
15 
16 using url::table_data::hex;
17 using url::table_data::C0_CONTROL_ENCODE_SET;
18 using url::table_data::FRAGMENT_ENCODE_SET;
19 using url::table_data::PATH_ENCODE_SET;
20 using url::table_data::USERINFO_ENCODE_SET;
21 using url::table_data::QUERY_ENCODE_SET_NONSPECIAL;
22 using url::table_data::QUERY_ENCODE_SET_SPECIAL;
23 
24 using v8::Array;
25 using v8::Context;
26 using v8::Function;
27 using v8::FunctionCallbackInfo;
28 using v8::HandleScope;
29 using v8::Int32;
30 using v8::Integer;
31 using v8::Isolate;
32 using v8::Local;
33 using v8::MaybeLocal;
34 using v8::NewStringType;
35 using v8::Null;
36 using v8::Object;
37 using v8::String;
38 using v8::Undefined;
39 using v8::Value;
40 
Utf8String(Isolate * isolate,const std::string & str)41 Local<String> Utf8String(Isolate* isolate, const std::string& str) {
42   return String::NewFromUtf8(isolate,
43                              str.data(),
44                              NewStringType::kNormal,
45                              str.length()).ToLocalChecked();
46 }
47 
48 namespace url {
49 namespace {
50 
// https://url.spec.whatwg.org/#eof-code-point
// Sentinel that the parsing loops substitute for "one past the end of input".
constexpr char kEOL = static_cast<char>(-1);

// Used in ToUSVString().
constexpr char16_t kUnicodeReplacementCharacter{0xFFFD};
56 
57 // https://url.spec.whatwg.org/#concept-host
// https://url.spec.whatwg.org/#concept-host
//
// Result of parsing a URL host. A freshly constructed object is in the
// H_FAILED state; each Parse*() method moves it to one of the other states on
// success and leaves it in H_FAILED otherwise.
class URLHost {
 public:
  ~URLHost();

  void ParseIPv4Host(const char* input, size_t length, bool* is_ipv4);
  void ParseIPv6Host(const char* input, size_t length);
  void ParseOpaqueHost(const char* input, size_t length);
  void ParseHost(const char* input,
                 size_t length,
                 bool is_special,
                 bool unicode = false);

  // True while no Parse*() call has produced a valid host.
  bool ParsingFailed() const { return type_ == HostType::H_FAILED; }
  // Serializes the host; yields a copy and leaves `*this` untouched.
  std::string ToString() const;
  // Like ToString(), but avoids a copy in exchange for invalidating `*this`.
  std::string ToStringMove();

 private:
  enum class HostType {
    H_FAILED,
    H_DOMAIN,
    H_IPV4,
    H_IPV6,
    H_OPAQUE,
  };

  // Storage for whichever representation `type_` selects. The string member
  // is never constructed or destroyed implicitly: SetString() and Reset()
  // manage its lifetime by hand.
  union Value {
    std::string domain_or_opaque;
    uint32_t ipv4;
    uint16_t ipv6[8];

    ~Value() {}
    Value() : ipv4(0) {}
  };

  Value value_;
  HostType type_ = HostType::H_FAILED;

  // Whether `value_.domain_or_opaque` is the live union member.
  bool HoldsString() const {
    return type_ == HostType::H_DOMAIN || type_ == HostType::H_OPAQUE;
  }

  // Destroys the string member if it is live and returns to H_FAILED.
  void Reset() {
    if (HoldsString())
      value_.domain_or_opaque.~basic_string();
    type_ = HostType::H_FAILED;
  }

  // Setting the string members of the union with = is brittle because
  // it relies on them being initialized to a state that requires no
  // destruction of old data.
  // For a long time, that worked well enough because ParseIPv6Host() happens
  // to zero-fill `value_`, but that really is relying on standard library
  // internals too much.
  // These helpers are the easiest solution but we might want to consider
  // just not forcing strings into a union.
  void SetString(HostType type, std::string&& string) {
    Reset();
    type_ = type;
    new(&value_.domain_or_opaque) std::string(std::move(string));
  }

  void SetOpaque(std::string&& string) {
    SetString(HostType::H_OPAQUE, std::move(string));
  }

  void SetDomain(std::string&& string) {
    SetString(HostType::H_DOMAIN, std::move(string));
  }
};
129 
~URLHost()130 URLHost::~URLHost() {
131   Reset();
132 }
133 
// X-macro listing the enumerators of url_cb_args, in order.
#define ARGS(ENTRY)                                                           \
  ENTRY(ARG_FLAGS)                                                            \
  ENTRY(ARG_PROTOCOL)                                                         \
  ENTRY(ARG_USERNAME)                                                         \
  ENTRY(ARG_PASSWORD)                                                         \
  ENTRY(ARG_HOST)                                                             \
  ENTRY(ARG_PORT)                                                             \
  ENTRY(ARG_PATH)                                                             \
  ENTRY(ARG_QUERY)                                                            \
  ENTRY(ARG_FRAGMENT)                                                         \
  ENTRY(ARG_COUNT)  // This one has to be last.

// X-macro listing the enumerators of url_error_cb_args, in order.
#define ERR_ARGS(ENTRY)                                                       \
  ENTRY(ERR_ARG_FLAGS)                                                        \
  ENTRY(ERR_ARG_INPUT)

// Each entry of ARGS becomes one enumerator, numbered from 0, so ARG_COUNT
// equals the number of entries that precede it.
enum url_cb_args {
#define DECLARE_ENUMERATOR(name) name,
  ARGS(DECLARE_ENUMERATOR)
#undef DECLARE_ENUMERATOR
};

// Each entry of ERR_ARGS becomes one enumerator, numbered from 0.
enum url_error_cb_args {
#define DECLARE_ENUMERATOR(name) name,
  ERR_ARGS(DECLARE_ENUMERATOR)
#undef DECLARE_ENUMERATOR
};
161 
// Defines `template <typename T> bool name(const T ch)` returning `expr`.
// `bits` is the minimum width the character type must have; narrower types
// are rejected at compile time.
#define CHAR_TEST(bits, name, expr)                                           \
  template <typename T>                                                       \
  bool name(const T ch) {                                                     \
    static_assert(sizeof(ch) >= (bits) / 8,                                   \
                  "Character must be at least " #bits " bits wide");          \
    return (expr);                                                            \
  }

// Defines two overloads of `name`: one taking the characters `ch1` and `ch2`
// and returning `expr`, and one taking a std::basic_string that applies the
// first overload to its first two characters (false for shorter strings).
#define TWO_CHAR_STRING_TEST(bits, name, expr)                                \
  template <typename T>                                                       \
  bool name(const T ch1, const T ch2) {                                       \
    static_assert(sizeof(ch1) >= (bits) / 8,                                  \
                  "Character must be at least " #bits " bits wide");          \
    return (expr);                                                            \
  }                                                                           \
  template <typename T>                                                       \
  bool name(const std::basic_string<T>& str) {                                \
    static_assert(sizeof(str[0]) >= (bits) / 8,                               \
                  "Character must be at least " #bits " bits wide");          \
    return str.length() >= 2 && name(str[0], str[1]);                         \
  }

// https://infra.spec.whatwg.org/#ascii-tab-or-newline
CHAR_TEST(8, IsASCIITabOrNewline, (ch == '\t' || ch == '\n' || ch == '\r'))

// https://infra.spec.whatwg.org/#c0-control-or-space
CHAR_TEST(8, IsC0ControlOrSpace, (ch >= '\0' && ch <= ' '))

// https://infra.spec.whatwg.org/#ascii-digit
CHAR_TEST(8, IsASCIIDigit, (ch >= '0' && ch <= '9'))

// https://infra.spec.whatwg.org/#ascii-hex-digit
CHAR_TEST(8, IsASCIIHexDigit, (IsASCIIDigit(ch) ||
                               (ch >= 'A' && ch <= 'F') ||
                               (ch >= 'a' && ch <= 'f')))

// https://infra.spec.whatwg.org/#ascii-alpha
CHAR_TEST(8, IsASCIIAlpha, ((ch >= 'A' && ch <= 'Z') ||
                            (ch >= 'a' && ch <= 'z')))

// https://infra.spec.whatwg.org/#ascii-alphanumeric
CHAR_TEST(8, IsASCIIAlphanumeric, (IsASCIIDigit(ch) || IsASCIIAlpha(ch)))

// https://infra.spec.whatwg.org/#ascii-lowercase
// Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves lowercase letters
// unchanged; non-letters are returned as they are.
template <typename T>
T ASCIILowercase(T ch) {
  return IsASCIIAlpha(ch) ? (ch | 0x20) : ch;
}

// https://url.spec.whatwg.org/#forbidden-host-code-point
CHAR_TEST(8, IsForbiddenHostCodePoint,
          ch == '\0' || ch == '\t' || ch == '\n' || ch == '\r' ||
          ch == ' ' || ch == '#' || ch == '%' || ch == '/' ||
          ch == ':' || ch == '?' || ch == '@' || ch == '[' ||
          ch == '\\' || ch == ']')

// https://url.spec.whatwg.org/#windows-drive-letter
TWO_CHAR_STRING_TEST(8, IsWindowsDriveLetter,
                     (IsASCIIAlpha(ch1) && (ch2 == ':' || ch2 == '|')))

// https://url.spec.whatwg.org/#normalized-windows-drive-letter
TWO_CHAR_STRING_TEST(8, IsNormalizedWindowsDriveLetter,
                     (IsASCIIAlpha(ch1) && ch2 == ':'))

// If a UTF-16 character is a low/trailing surrogate.
CHAR_TEST(16, IsUnicodeTrail, (ch & 0xFC00) == 0xDC00)

// If a UTF-16 character is a surrogate.
CHAR_TEST(16, IsUnicodeSurrogate, (ch & 0xF800) == 0xD800)

// If a UTF-16 surrogate is a low/trailing one.
CHAR_TEST(16, IsUnicodeSurrogateTrail, (ch & 0x400) != 0)

#undef CHAR_TEST
#undef TWO_CHAR_STRING_TEST
237 
238 
// Reads bit `i` of the bit set `a`: byte i / 8, bit i % 8 counted from the
// least significant bit.
bool BitAt(const uint8_t a[], const uint8_t i) {
  const uint8_t byte = a[i / 8];
  const unsigned mask = 1u << (i % 8);
  return (byte & mask) != 0;
}
242 
243 // Appends ch to str. If ch position in encode_set is set, the ch will
244 // be percent-encoded then appended.
AppendOrEscape(std::string * str,const unsigned char ch,const uint8_t encode_set[])245 void AppendOrEscape(std::string* str,
246                     const unsigned char ch,
247                     const uint8_t encode_set[]) {
248   if (BitAt(encode_set, ch))
249     *str += hex + ch * 4;  // "%XX\0" has a length of 4
250   else
251     *str += ch;
252 }
253 
// Converts one ASCII hex digit to its numeric value (0-15). Any other
// character yields static_cast<unsigned>(-1).
template <typename T>
unsigned hex2bin(const T ch) {
  unsigned value = static_cast<unsigned>(-1);
  if ('0' <= ch && ch <= '9') {
    value = ch - '0';
  } else if ('A' <= ch && ch <= 'F') {
    value = 10 + (ch - 'A');
  } else if ('a' <= ch && ch <= 'f') {
    value = 10 + (ch - 'a');
  }
  return value;
}
264 
PercentDecode(const char * input,size_t len)265 std::string PercentDecode(const char* input, size_t len) {
266   std::string dest;
267   if (len == 0)
268     return dest;
269   dest.reserve(len);
270   const char* pointer = input;
271   const char* end = input + len;
272 
273   while (pointer < end) {
274     const char ch = pointer[0];
275     size_t remaining = end - pointer - 1;
276     if (ch != '%' || remaining < 2 ||
277         (ch == '%' &&
278          (!IsASCIIHexDigit(pointer[1]) ||
279           !IsASCIIHexDigit(pointer[2])))) {
280       dest += ch;
281       pointer++;
282       continue;
283     } else {
284       unsigned a = hex2bin(pointer[1]);
285       unsigned b = hex2bin(pointer[2]);
286       char c = static_cast<char>(a * 16 + b);
287       dest += c;
288       pointer += 3;
289     }
290   }
291   return dest;
292 }
293 
// X-macro over the special schemes. Each entry is
// (identifier, default port, scheme string including the trailing ':');
// "file:" carries -1 as its port value.
#define SPECIALS(ENTRY)                                                       \
  ENTRY(ftp, 21, "ftp:")                                                      \
  ENTRY(file, -1, "file:")                                                    \
  ENTRY(gopher, 70, "gopher:")                                                \
  ENTRY(http, 80, "http:")                                                    \
  ENTRY(https, 443, "https:")                                                 \
  ENTRY(ws, 80, "ws:")                                                        \
  ENTRY(wss, 443, "wss:")
302 
IsSpecial(const std::string & scheme)303 bool IsSpecial(const std::string& scheme) {
304 #define V(_, __, name) if (scheme == name) return true;
305   SPECIALS(V);
306 #undef V
307   return false;
308 }
309 
GetSpecial(Environment * env,const std::string & scheme)310 Local<String> GetSpecial(Environment* env, const std::string& scheme) {
311 #define V(key, _, name) if (scheme == name)                                  \
312     return env->url_special_##key##_string();
313   SPECIALS(V)
314 #undef V
315   UNREACHABLE();
316 }
317 
NormalizePort(const std::string & scheme,int p)318 int NormalizePort(const std::string& scheme, int p) {
319 #define V(_, port, name) if (scheme == name && p == port) return -1;
320   SPECIALS(V);
321 #undef V
322   return p;
323 }
324 
325 // https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
StartsWithWindowsDriveLetter(const char * p,const char * end)326 bool StartsWithWindowsDriveLetter(const char* p, const char* end) {
327   size_t length = end - p;
328   return length >= 2 &&
329     IsWindowsDriveLetter(p[0], p[1]) &&
330     (length == 2 ||
331       p[2] == '/' ||
332       p[2] == '\\' ||
333       p[2] == '?' ||
334       p[2] == '#');
335 }
336 
#if defined(NODE_HAVE_I18N_SUPPORT)
// Runs `input` through i18n::ToUnicode(). On success the converted text is
// stored in `*output` and true is returned; a negative result from the
// conversion returns false and leaves `*output` untouched.
bool ToUnicode(const std::string& input, std::string* output) {
  MaybeStackBuffer<char> converted;
  const bool ok =
      i18n::ToUnicode(&converted, input.c_str(), input.length()) >= 0;
  if (ok)
    output->assign(*converted, converted.length());
  return ok;
}

// Runs `input` through i18n::ToASCII(). On success the converted text is
// stored in `*output` and true is returned; a negative result from the
// conversion returns false and leaves `*output` untouched.
bool ToASCII(const std::string& input, std::string* output) {
  MaybeStackBuffer<char> converted;
  const bool ok =
      i18n::ToASCII(&converted, input.c_str(), input.length()) >= 0;
  if (ok)
    output->assign(*converted, converted.length());
  return ok;
}
#else
// Intentional no-ops if ICU is not present: the input is copied through
// unchanged and the call always succeeds.
bool ToUnicode(const std::string& input, std::string* output) {
  output->assign(input);
  return true;
}

bool ToASCII(const std::string& input, std::string* output) {
  output->assign(input);
  return true;
}
#endif
365 
366 #define NS_IN6ADDRSZ 16
367 
ParseIPv6Host(const char * input,size_t length)368 void URLHost::ParseIPv6Host(const char* input, size_t length) {
369   CHECK_EQ(type_, HostType::H_FAILED);
370 
371   unsigned char buf[sizeof(struct in6_addr)];
372   MaybeStackBuffer<char> ipv6(length + 1);
373   *(*ipv6 + length) = 0;
374   memset(buf, 0, sizeof(buf));
375   memcpy(*ipv6, input, sizeof(const char) * length);
376 
377   int ret = uv_inet_pton(AF_INET6, *ipv6, buf);
378 
379   if (ret != 0) {
380     return;
381   }
382 
383   // Ref: https://sourceware.org/git/?p=glibc.git;a=blob;f=resolv/inet_ntop.c;h=c4d38c0f951013e51a4fc6eaa8a9b82e146abe5a;hb=HEAD#l119
384   for (int i = 0; i < NS_IN6ADDRSZ; i += 2) {
385     value_.ipv6[i >> 1] = (buf[i] << 8) | buf[i + 1];
386   }
387 
388   type_ = HostType::H_IPV6;
389 }
390 
// Parses the characters in [start, end) as one part of an IPv4 address.
//
// A leading "0x"/"0X" selects hexadecimal; otherwise a leading '0' followed by
// at least one more character selects octal; anything else is decimal. An
// empty range, or a bare "0x" prefix, yields 0.
//
// Returns the parsed value, -1 if any character is not a digit of the selected
// radix, or INT64_MAX if the value does not fit in an int64_t.
//
// Only characters inside [start, end) are read, so the result does not depend
// on what follows `end`.
int64_t ParseNumber(const char* start, const char* end) {
  int64_t radix = 10;
  if (end - start >= 2 && start[0] == '0' && (start[1] | 0x20) == 'x') {
    start += 2;
    radix = 16;
  }
  if (start == end)
    return 0;
  if (radix == 10 && end - start > 1 && start[0] == '0') {
    start++;
    radix = 8;
  }

  int64_t value = 0;
  bool overflowed = false;
  for (const char* p = start; p < end; p++) {
    const char ch = *p;
    int64_t digit;
    if (ch >= '0' && ch <= '9')
      digit = ch - '0';
    else if (ch >= 'a' && ch <= 'f')
      digit = 10 + (ch - 'a');
    else if (ch >= 'A' && ch <= 'F')
      digit = 10 + (ch - 'A');
    else
      return -1;
    if (digit >= radix)
      return -1;

    // Keep validating the remaining characters after an overflow so that an
    // invalid digit still takes precedence over saturation.
    if (overflowed)
      continue;
    if (value > (INT64_MAX - digit) / radix) {
      overflowed = true;
      value = INT64_MAX;
    } else {
      value = value * radix + digit;
    }
  }
  return value;
}
425 
ParseIPv4Host(const char * input,size_t length,bool * is_ipv4)426 void URLHost::ParseIPv4Host(const char* input, size_t length, bool* is_ipv4) {
427   CHECK_EQ(type_, HostType::H_FAILED);
428   *is_ipv4 = false;
429   const char* pointer = input;
430   const char* mark = input;
431   const char* end = pointer + length;
432   int parts = 0;
433   uint32_t val = 0;
434   uint64_t numbers[4];
435   int tooBigNumbers = 0;
436   if (length == 0)
437     return;
438 
439   while (pointer <= end) {
440     const char ch = pointer < end ? pointer[0] : kEOL;
441     int64_t remaining = end - pointer - 1;
442     if (ch == '.' || ch == kEOL) {
443       if (++parts > static_cast<int>(arraysize(numbers)))
444         return;
445       if (pointer == mark)
446         return;
447       int64_t n = ParseNumber(mark, pointer);
448       if (n < 0)
449         return;
450 
451       if (n > 255) {
452         tooBigNumbers++;
453       }
454       numbers[parts - 1] = n;
455       mark = pointer + 1;
456       if (ch == '.' && remaining == 0)
457         break;
458     }
459     pointer++;
460   }
461   CHECK_GT(parts, 0);
462   *is_ipv4 = true;
463 
464   // If any but the last item in numbers is greater than 255, return failure.
465   // If the last item in numbers is greater than or equal to
466   // 256^(5 - the number of items in numbers), return failure.
467   if (tooBigNumbers > 1 ||
468       (tooBigNumbers == 1 && numbers[parts - 1] <= 255) ||
469       numbers[parts - 1] >= pow(256, static_cast<double>(5 - parts))) {
470     return;
471   }
472 
473   type_ = HostType::H_IPV4;
474   val = static_cast<uint32_t>(numbers[parts - 1]);
475   for (int n = 0; n < parts - 1; n++) {
476     double b = 3 - n;
477     val +=
478         static_cast<uint32_t>(numbers[n]) * static_cast<uint32_t>(pow(256, b));
479   }
480 
481   value_.ipv4 = val;
482 }
483 
ParseOpaqueHost(const char * input,size_t length)484 void URLHost::ParseOpaqueHost(const char* input, size_t length) {
485   CHECK_EQ(type_, HostType::H_FAILED);
486   std::string output;
487   output.reserve(length);
488   for (size_t i = 0; i < length; i++) {
489     const char ch = input[i];
490     if (ch != '%' && IsForbiddenHostCodePoint(ch)) {
491       return;
492     } else {
493       AppendOrEscape(&output, ch, C0_CONTROL_ENCODE_SET);
494     }
495   }
496 
497   SetOpaque(std::move(output));
498 }
499 
ParseHost(const char * input,size_t length,bool is_special,bool unicode)500 void URLHost::ParseHost(const char* input,
501                         size_t length,
502                         bool is_special,
503                         bool unicode) {
504   CHECK_EQ(type_, HostType::H_FAILED);
505   const char* pointer = input;
506 
507   if (length == 0)
508     return;
509 
510   if (pointer[0] == '[') {
511     if (pointer[length - 1] != ']')
512       return;
513     return ParseIPv6Host(++pointer, length - 2);
514   }
515 
516   if (!is_special)
517     return ParseOpaqueHost(input, length);
518 
519   // First, we have to percent decode
520   std::string decoded = PercentDecode(input, length);
521 
522   // Then we have to punycode toASCII
523   if (!ToASCII(decoded, &decoded))
524     return;
525 
526   // If any of the following characters are still present, we have to fail
527   for (size_t n = 0; n < decoded.size(); n++) {
528     const char ch = decoded[n];
529     if (IsForbiddenHostCodePoint(ch)) {
530       return;
531     }
532   }
533 
534   // Check to see if it's an IPv4 IP address
535   bool is_ipv4;
536   ParseIPv4Host(decoded.c_str(), decoded.length(), &is_ipv4);
537   if (is_ipv4)
538     return;
539 
540   // If the unicode flag is set, run the result through punycode ToUnicode
541   if (unicode && !ToUnicode(decoded, &decoded))
542     return;
543 
544   // It's not an IPv4 or IPv6 address, it must be a domain
545   SetDomain(std::move(decoded));
546 }
547 
// Locates the longest sequence of 0 segments in an IPv6 address
// in order to use the :: compression when serializing.
// Returns a pointer to the first element of the longest run of zeros in
// values[0..len), considering only runs of at least two elements; the earliest
// run wins a tie. Returns nullptr when there is no such run.
template <typename T>
T* FindLongestZeroSequence(T* values, size_t len) {
  T* best_start = nullptr;
  unsigned best_length = 1;  // A run has to beat this to be selected.

  size_t index = 0;
  while (index < len) {
    if (values[index] != 0) {
      index++;
      continue;
    }
    // Measure the run of zeros that begins here.
    const size_t run_start = index;
    while (index < len && values[index] == 0)
      index++;
    const unsigned run_length = static_cast<unsigned>(index - run_start);
    if (run_length > best_length) {
      best_length = run_length;
      best_start = values + run_start;
    }
  }
  return best_start;
}
578 
ToStringMove()579 std::string URLHost::ToStringMove() {
580   std::string return_value;
581   switch (type_) {
582     case HostType::H_DOMAIN:
583     case HostType::H_OPAQUE:
584       return_value = std::move(value_.domain_or_opaque);
585       break;
586     default:
587       return_value = ToString();
588       break;
589   }
590   Reset();
591   return return_value;
592 }
593 
ToString() const594 std::string URLHost::ToString() const {
595   std::string dest;
596   switch (type_) {
597     case HostType::H_DOMAIN:
598     case HostType::H_OPAQUE:
599       return value_.domain_or_opaque;
600       break;
601     case HostType::H_IPV4: {
602       dest.reserve(15);
603       uint32_t value = value_.ipv4;
604       for (int n = 0; n < 4; n++) {
605         char buf[4];
606         snprintf(buf, sizeof(buf), "%d", value % 256);
607         dest.insert(0, buf);
608         if (n < 3)
609           dest.insert(0, 1, '.');
610         value /= 256;
611       }
612       break;
613     }
614     case HostType::H_IPV6: {
615       dest.reserve(41);
616       dest += '[';
617       const uint16_t* start = &value_.ipv6[0];
618       const uint16_t* compress_pointer =
619           FindLongestZeroSequence(start, 8);
620       bool ignore0 = false;
621       for (int n = 0; n <= 7; n++) {
622         const uint16_t* piece = &value_.ipv6[n];
623         if (ignore0 && *piece == 0)
624           continue;
625         else if (ignore0)
626           ignore0 = false;
627         if (compress_pointer == piece) {
628           dest += n == 0 ? "::" : ":";
629           ignore0 = true;
630           continue;
631         }
632         char buf[5];
633         snprintf(buf, sizeof(buf), "%x", *piece);
634         dest += buf;
635         if (n < 7)
636           dest += ':';
637       }
638       dest += ']';
639       break;
640     }
641     case HostType::H_FAILED:
642       break;
643   }
644   return dest;
645 }
646 
ParseHost(const std::string & input,std::string * output,bool is_special,bool unicode=false)647 bool ParseHost(const std::string& input,
648                std::string* output,
649                bool is_special,
650                bool unicode = false) {
651   if (input.empty()) {
652     output->clear();
653     return true;
654   }
655   URLHost host;
656   host.ParseHost(input.c_str(), input.length(), is_special, unicode);
657   if (host.ParsingFailed())
658     return false;
659   *output = host.ToStringMove();
660   return true;
661 }
662 
FromJSStringArray(Environment * env,Local<Array> array)663 std::vector<std::string> FromJSStringArray(Environment* env,
664                                            Local<Array> array) {
665   std::vector<std::string> vec;
666   if (array->Length() > 0)
667     vec.reserve(array->Length());
668   for (size_t n = 0; n < array->Length(); n++) {
669     Local<Value> val = array->Get(env->context(), n).ToLocalChecked();
670     if (val->IsString()) {
671       Utf8Value value(env->isolate(), val.As<String>());
672       vec.emplace_back(*value, value.length());
673     }
674   }
675   return vec;
676 }
677 
HarvestBase(Environment * env,Local<Object> base_obj)678 url_data HarvestBase(Environment* env, Local<Object> base_obj) {
679   url_data base;
680   Local<Context> context = env->context();
681 
682   Local<Value> flags =
683       base_obj->Get(env->context(), env->flags_string()).ToLocalChecked();
684   if (flags->IsInt32())
685     base.flags = flags->Int32Value(context).FromJust();
686 
687   Local<Value> port =
688       base_obj->Get(env->context(), env->port_string()).ToLocalChecked();
689   if (port->IsInt32())
690     base.port = port->Int32Value(context).FromJust();
691 
692   Local<Value> scheme =
693       base_obj->Get(env->context(), env->scheme_string()).ToLocalChecked();
694   base.scheme = Utf8Value(env->isolate(), scheme).out();
695 
696   auto GetStr = [&](std::string url_data::*member,
697                     int flag,
698                     Local<String> name,
699                     bool empty_as_present) {
700     Local<Value> value = base_obj->Get(env->context(), name).ToLocalChecked();
701     if (value->IsString()) {
702       Utf8Value utf8value(env->isolate(), value.As<String>());
703       (base.*member).assign(*utf8value, utf8value.length());
704       if (empty_as_present || value.As<String>()->Length() != 0) {
705         base.flags |= flag;
706       }
707     }
708   };
709   GetStr(&url_data::username,
710          URL_FLAGS_HAS_USERNAME,
711          env->username_string(),
712          false);
713   GetStr(&url_data::password,
714          URL_FLAGS_HAS_PASSWORD,
715          env->password_string(),
716          false);
717   GetStr(&url_data::host, URL_FLAGS_HAS_HOST, env->host_string(), true);
718   GetStr(&url_data::query, URL_FLAGS_HAS_QUERY, env->query_string(), true);
719   GetStr(&url_data::fragment,
720          URL_FLAGS_HAS_FRAGMENT,
721          env->fragment_string(),
722          true);
723 
724   Local<Value>
725       path = base_obj->Get(env->context(), env->path_string()).ToLocalChecked();
726   if (path->IsArray()) {
727     base.flags |= URL_FLAGS_HAS_PATH;
728     base.path = FromJSStringArray(env, path.As<Array>());
729   }
730   return base;
731 }
732 
HarvestContext(Environment * env,Local<Object> context_obj)733 url_data HarvestContext(Environment* env, Local<Object> context_obj) {
734   url_data context;
735   Local<Value> flags =
736       context_obj->Get(env->context(), env->flags_string()).ToLocalChecked();
737   if (flags->IsInt32()) {
738     static constexpr int32_t kCopyFlagsMask =
739         URL_FLAGS_SPECIAL |
740         URL_FLAGS_CANNOT_BE_BASE |
741         URL_FLAGS_HAS_USERNAME |
742         URL_FLAGS_HAS_PASSWORD |
743         URL_FLAGS_HAS_HOST;
744     context.flags |= flags.As<Int32>()->Value() & kCopyFlagsMask;
745   }
746   Local<Value> scheme =
747       context_obj->Get(env->context(), env->scheme_string()).ToLocalChecked();
748   if (scheme->IsString()) {
749     Utf8Value value(env->isolate(), scheme);
750     context.scheme.assign(*value, value.length());
751   }
752   Local<Value> port =
753       context_obj->Get(env->context(), env->port_string()).ToLocalChecked();
754   if (port->IsInt32())
755     context.port = port.As<Int32>()->Value();
756   if (context.flags & URL_FLAGS_HAS_USERNAME) {
757     Local<Value> username =
758         context_obj->Get(env->context(),
759                          env->username_string()).ToLocalChecked();
760     CHECK(username->IsString());
761     Utf8Value value(env->isolate(), username);
762     context.username.assign(*value, value.length());
763   }
764   if (context.flags & URL_FLAGS_HAS_PASSWORD) {
765     Local<Value> password =
766         context_obj->Get(env->context(),
767                          env->password_string()).ToLocalChecked();
768     CHECK(password->IsString());
769     Utf8Value value(env->isolate(), password);
770     context.password.assign(*value, value.length());
771   }
772   Local<Value> host =
773       context_obj->Get(env->context(),
774                        env->host_string()).ToLocalChecked();
775   if (host->IsString()) {
776     Utf8Value value(env->isolate(), host);
777     context.host.assign(*value, value.length());
778   }
779   return context;
780 }
781 
// Single dot segment can be ".", "%2e", or "%2E"
bool IsSingleDotSegment(const std::string& str) {
  return str == "." || str == "%2e" || str == "%2E";
}
795 
// Double dot segment can be:
//   "..", ".%2e", ".%2E", "%2e.", "%2E.",
//   "%2e%2e", "%2E%2E", "%2e%2E", or "%2E%2e"
// In other words: exactly two dots, each written either literally or as
// "%2e" / "%2E".
bool IsDoubleDotSegment(const std::string& str) {
  size_t pos = 0;
  for (int dots = 0; dots < 2; dots++) {
    // `pos` never exceeds str.size() here, so compare() cannot throw.
    if (str.compare(pos, 1, ".") == 0) {
      pos += 1;
    } else if (str.compare(pos, 3, "%2e") == 0 ||
               str.compare(pos, 3, "%2E") == 0) {
      pos += 3;
    } else {
      return false;
    }
  }
  return pos == str.size();
}
825 
ShortenUrlPath(struct url_data * url)826 void ShortenUrlPath(struct url_data* url) {
827   if (url->path.empty()) return;
828   if (url->path.size() == 1 && url->scheme == "file:" &&
829       IsNormalizedWindowsDriveLetter(url->path[0])) return;
830   url->path.pop_back();
831 }
832 
833 }  // anonymous namespace
834 
Parse(const char * input,size_t len,enum url_parse_state state_override,struct url_data * url,bool has_url,const struct url_data * base,bool has_base)835 void URL::Parse(const char* input,
836                 size_t len,
837                 enum url_parse_state state_override,
838                 struct url_data* url,
839                 bool has_url,
840                 const struct url_data* base,
841                 bool has_base) {
842   const char* p = input;
843   const char* end = input + len;
844 
845   if (!has_url) {
846     for (const char* ptr = p; ptr < end; ptr++) {
847       if (IsC0ControlOrSpace(*ptr))
848         p++;
849       else
850         break;
851     }
852     for (const char* ptr = end - 1; ptr >= p; ptr--) {
853       if (IsC0ControlOrSpace(*ptr))
854         end--;
855       else
856         break;
857     }
858     input = p;
859     len = end - p;
860   }
861 
862   // The spec says we should strip out any ASCII tabs or newlines.
863   // In those cases, we create another std::string instance with the filtered
864   // contents, but in the general case we avoid the overhead.
865   std::string whitespace_stripped;
866   for (const char* ptr = p; ptr < end; ptr++) {
867     if (!IsASCIITabOrNewline(*ptr))
868       continue;
869     // Hit tab or newline. Allocate storage, copy what we have until now,
870     // and then iterate and filter all similar characters out.
871     whitespace_stripped.reserve(len - 1);
872     whitespace_stripped.assign(p, ptr - p);
873     // 'ptr + 1' skips the current char, which we know to be tab or newline.
874     for (ptr = ptr + 1; ptr < end; ptr++) {
875       if (!IsASCIITabOrNewline(*ptr))
876         whitespace_stripped += *ptr;
877     }
878 
879     // Update variables like they should have looked like if the string
880     // had been stripped of whitespace to begin with.
881     input = whitespace_stripped.c_str();
882     len = whitespace_stripped.size();
883     p = input;
884     end = input + len;
885     break;
886   }
887 
888   bool atflag = false;  // Set when @ has been seen.
889   bool square_bracket_flag = false;  // Set inside of [...]
890   bool password_token_seen_flag = false;  // Set after a : after an username.
891 
892   std::string buffer;
893 
894   // Set the initial parse state.
895   const bool has_state_override = state_override != kUnknownState;
896   enum url_parse_state state = has_state_override ? state_override :
897                                                     kSchemeStart;
898 
899   if (state < kSchemeStart || state > kFragment) {
900     url->flags |= URL_FLAGS_INVALID_PARSE_STATE;
901     return;
902   }
903 
904   while (p <= end) {
905     const char ch = p < end ? p[0] : kEOL;
906     bool special = (url->flags & URL_FLAGS_SPECIAL);
907     bool cannot_be_base;
908     bool special_back_slash = (special && ch == '\\');
909 
910     switch (state) {
911       case kSchemeStart:
912         if (IsASCIIAlpha(ch)) {
913           buffer += ASCIILowercase(ch);
914           state = kScheme;
915         } else if (!has_state_override) {
916           state = kNoScheme;
917           continue;
918         } else {
919           url->flags |= URL_FLAGS_FAILED;
920           return;
921         }
922         break;
923       case kScheme:
924         if (IsASCIIAlphanumeric(ch) || ch == '+' || ch == '-' || ch == '.') {
925           buffer += ASCIILowercase(ch);
926         } else if (ch == ':' || (has_state_override && ch == kEOL)) {
927           if (has_state_override && buffer.size() == 0) {
928             url->flags |= URL_FLAGS_TERMINATED;
929             return;
930           }
931           buffer += ':';
932 
933           bool new_is_special = IsSpecial(buffer);
934 
935           if (has_state_override) {
936             if ((special != new_is_special) ||
937                 ((buffer == "file:") &&
938                  ((url->flags & URL_FLAGS_HAS_USERNAME) ||
939                   (url->flags & URL_FLAGS_HAS_PASSWORD) ||
940                   (url->port != -1)))) {
941               url->flags |= URL_FLAGS_TERMINATED;
942               return;
943             }
944 
945             // File scheme && (host == empty or null) check left to JS-land
946             // as it can be done before even entering C++ binding.
947           }
948 
949           url->scheme = std::move(buffer);
950           url->port = NormalizePort(url->scheme, url->port);
951           if (new_is_special) {
952             url->flags |= URL_FLAGS_SPECIAL;
953             special = true;
954           } else {
955             url->flags &= ~URL_FLAGS_SPECIAL;
956             special = false;
957           }
958           special_back_slash = (special && ch == '\\');
959           buffer.clear();
960           if (has_state_override)
961             return;
962           if (url->scheme == "file:") {
963             state = kFile;
964           } else if (special &&
965                      has_base &&
966                      url->scheme == base->scheme) {
967             state = kSpecialRelativeOrAuthority;
968           } else if (special) {
969             state = kSpecialAuthoritySlashes;
970           } else if (p + 1 < end && p[1] == '/') {
971             state = kPathOrAuthority;
972             p++;
973           } else {
974             url->flags |= URL_FLAGS_CANNOT_BE_BASE;
975             url->flags |= URL_FLAGS_HAS_PATH;
976             url->path.emplace_back("");
977             state = kCannotBeBase;
978           }
979         } else if (!has_state_override) {
980           buffer.clear();
981           state = kNoScheme;
982           p = input;
983           continue;
984         } else {
985           url->flags |= URL_FLAGS_FAILED;
986           return;
987         }
988         break;
989       case kNoScheme:
990         cannot_be_base = has_base && (base->flags & URL_FLAGS_CANNOT_BE_BASE);
991         if (!has_base || (cannot_be_base && ch != '#')) {
992           url->flags |= URL_FLAGS_FAILED;
993           return;
994         } else if (cannot_be_base && ch == '#') {
995           url->scheme = base->scheme;
996           if (IsSpecial(url->scheme)) {
997             url->flags |= URL_FLAGS_SPECIAL;
998             special = true;
999           } else {
1000             url->flags &= ~URL_FLAGS_SPECIAL;
1001             special = false;
1002           }
1003           special_back_slash = (special && ch == '\\');
1004           if (base->flags & URL_FLAGS_HAS_PATH) {
1005             url->flags |= URL_FLAGS_HAS_PATH;
1006             url->path = base->path;
1007           }
1008           if (base->flags & URL_FLAGS_HAS_QUERY) {
1009             url->flags |= URL_FLAGS_HAS_QUERY;
1010             url->query = base->query;
1011           }
1012           if (base->flags & URL_FLAGS_HAS_FRAGMENT) {
1013             url->flags |= URL_FLAGS_HAS_FRAGMENT;
1014             url->fragment = base->fragment;
1015           }
1016           url->flags |= URL_FLAGS_CANNOT_BE_BASE;
1017           state = kFragment;
1018         } else if (has_base &&
1019                    base->scheme != "file:") {
1020           state = kRelative;
1021           continue;
1022         } else {
1023           url->scheme = "file:";
1024           url->flags |= URL_FLAGS_SPECIAL;
1025           special = true;
1026           state = kFile;
1027           special_back_slash = (special && ch == '\\');
1028           continue;
1029         }
1030         break;
1031       case kSpecialRelativeOrAuthority:
1032         if (ch == '/' && p + 1 < end && p[1] == '/') {
1033           state = kSpecialAuthorityIgnoreSlashes;
1034           p++;
1035         } else {
1036           state = kRelative;
1037           continue;
1038         }
1039         break;
1040       case kPathOrAuthority:
1041         if (ch == '/') {
1042           state = kAuthority;
1043         } else {
1044           state = kPath;
1045           continue;
1046         }
1047         break;
1048       case kRelative:
1049         url->scheme = base->scheme;
1050         if (IsSpecial(url->scheme)) {
1051           url->flags |= URL_FLAGS_SPECIAL;
1052           special = true;
1053         } else {
1054           url->flags &= ~URL_FLAGS_SPECIAL;
1055           special = false;
1056         }
1057         special_back_slash = (special && ch == '\\');
1058         switch (ch) {
1059           case kEOL:
1060             if (base->flags & URL_FLAGS_HAS_USERNAME) {
1061               url->flags |= URL_FLAGS_HAS_USERNAME;
1062               url->username = base->username;
1063             }
1064             if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1065               url->flags |= URL_FLAGS_HAS_PASSWORD;
1066               url->password = base->password;
1067             }
1068             if (base->flags & URL_FLAGS_HAS_HOST) {
1069               url->flags |= URL_FLAGS_HAS_HOST;
1070               url->host = base->host;
1071             }
1072             if (base->flags & URL_FLAGS_HAS_QUERY) {
1073               url->flags |= URL_FLAGS_HAS_QUERY;
1074               url->query = base->query;
1075             }
1076             if (base->flags & URL_FLAGS_HAS_PATH) {
1077               url->flags |= URL_FLAGS_HAS_PATH;
1078               url->path = base->path;
1079             }
1080             url->port = base->port;
1081             break;
1082           case '/':
1083             state = kRelativeSlash;
1084             break;
1085           case '?':
1086             if (base->flags & URL_FLAGS_HAS_USERNAME) {
1087               url->flags |= URL_FLAGS_HAS_USERNAME;
1088               url->username = base->username;
1089             }
1090             if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1091               url->flags |= URL_FLAGS_HAS_PASSWORD;
1092               url->password = base->password;
1093             }
1094             if (base->flags & URL_FLAGS_HAS_HOST) {
1095               url->flags |= URL_FLAGS_HAS_HOST;
1096               url->host = base->host;
1097             }
1098             if (base->flags & URL_FLAGS_HAS_PATH) {
1099               url->flags |= URL_FLAGS_HAS_PATH;
1100               url->path = base->path;
1101             }
1102             url->port = base->port;
1103             state = kQuery;
1104             break;
1105           case '#':
1106             if (base->flags & URL_FLAGS_HAS_USERNAME) {
1107               url->flags |= URL_FLAGS_HAS_USERNAME;
1108               url->username = base->username;
1109             }
1110             if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1111               url->flags |= URL_FLAGS_HAS_PASSWORD;
1112               url->password = base->password;
1113             }
1114             if (base->flags & URL_FLAGS_HAS_HOST) {
1115               url->flags |= URL_FLAGS_HAS_HOST;
1116               url->host = base->host;
1117             }
1118             if (base->flags & URL_FLAGS_HAS_QUERY) {
1119               url->flags |= URL_FLAGS_HAS_QUERY;
1120               url->query = base->query;
1121             }
1122             if (base->flags & URL_FLAGS_HAS_PATH) {
1123               url->flags |= URL_FLAGS_HAS_PATH;
1124               url->path = base->path;
1125             }
1126             url->port = base->port;
1127             state = kFragment;
1128             break;
1129           default:
1130             if (special_back_slash) {
1131               state = kRelativeSlash;
1132             } else {
1133               if (base->flags & URL_FLAGS_HAS_USERNAME) {
1134                 url->flags |= URL_FLAGS_HAS_USERNAME;
1135                 url->username = base->username;
1136               }
1137               if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1138                 url->flags |= URL_FLAGS_HAS_PASSWORD;
1139                 url->password = base->password;
1140               }
1141               if (base->flags & URL_FLAGS_HAS_HOST) {
1142                 url->flags |= URL_FLAGS_HAS_HOST;
1143                 url->host = base->host;
1144               }
1145               if (base->flags & URL_FLAGS_HAS_PATH) {
1146                 url->flags |= URL_FLAGS_HAS_PATH;
1147                 url->path = base->path;
1148                 ShortenUrlPath(url);
1149               }
1150               url->port = base->port;
1151               state = kPath;
1152               continue;
1153             }
1154         }
1155         break;
1156       case kRelativeSlash:
1157         if (IsSpecial(url->scheme) && (ch == '/' || ch == '\\')) {
1158           state = kSpecialAuthorityIgnoreSlashes;
1159         } else if (ch == '/') {
1160           state = kAuthority;
1161         } else {
1162           if (base->flags & URL_FLAGS_HAS_USERNAME) {
1163             url->flags |= URL_FLAGS_HAS_USERNAME;
1164             url->username = base->username;
1165           }
1166           if (base->flags & URL_FLAGS_HAS_PASSWORD) {
1167             url->flags |= URL_FLAGS_HAS_PASSWORD;
1168             url->password = base->password;
1169           }
1170           if (base->flags & URL_FLAGS_HAS_HOST) {
1171             url->flags |= URL_FLAGS_HAS_HOST;
1172             url->host = base->host;
1173           }
1174           url->port = base->port;
1175           state = kPath;
1176           continue;
1177         }
1178         break;
1179       case kSpecialAuthoritySlashes:
1180         state = kSpecialAuthorityIgnoreSlashes;
1181         if (ch == '/' && p + 1 < end && p[1] == '/') {
1182           p++;
1183         } else {
1184           continue;
1185         }
1186         break;
1187       case kSpecialAuthorityIgnoreSlashes:
1188         if (ch != '/' && ch != '\\') {
1189           state = kAuthority;
1190           continue;
1191         }
1192         break;
1193       case kAuthority:
1194         if (ch == '@') {
1195           if (atflag) {
1196             buffer.reserve(buffer.size() + 3);
1197             buffer.insert(0, "%40");
1198           }
1199           atflag = true;
1200           size_t blen = buffer.size();
1201           if (blen > 0 && buffer[0] != ':') {
1202             url->flags |= URL_FLAGS_HAS_USERNAME;
1203           }
1204           for (size_t n = 0; n < blen; n++) {
1205             const char bch = buffer[n];
1206             if (bch == ':') {
1207               url->flags |= URL_FLAGS_HAS_PASSWORD;
1208               if (!password_token_seen_flag) {
1209                 password_token_seen_flag = true;
1210                 continue;
1211               }
1212             }
1213             if (password_token_seen_flag) {
1214               AppendOrEscape(&url->password, bch, USERINFO_ENCODE_SET);
1215             } else {
1216               AppendOrEscape(&url->username, bch, USERINFO_ENCODE_SET);
1217             }
1218           }
1219           buffer.clear();
1220         } else if (ch == kEOL ||
1221                    ch == '/' ||
1222                    ch == '?' ||
1223                    ch == '#' ||
1224                    special_back_slash) {
1225           if (atflag && buffer.size() == 0) {
1226             url->flags |= URL_FLAGS_FAILED;
1227             return;
1228           }
1229           p -= buffer.size() + 1;
1230           buffer.clear();
1231           state = kHost;
1232         } else {
1233           buffer += ch;
1234         }
1235         break;
1236       case kHost:
1237       case kHostname:
1238         if (has_state_override && url->scheme == "file:") {
1239           state = kFileHost;
1240           continue;
1241         } else if (ch == ':' && !square_bracket_flag) {
1242           if (buffer.size() == 0) {
1243             url->flags |= URL_FLAGS_FAILED;
1244             return;
1245           }
1246           url->flags |= URL_FLAGS_HAS_HOST;
1247           if (!ParseHost(buffer, &url->host, special)) {
1248             url->flags |= URL_FLAGS_FAILED;
1249             return;
1250           }
1251           buffer.clear();
1252           state = kPort;
1253           if (state_override == kHostname) {
1254             return;
1255           }
1256         } else if (ch == kEOL ||
1257                    ch == '/' ||
1258                    ch == '?' ||
1259                    ch == '#' ||
1260                    special_back_slash) {
1261           p--;
1262           if (special && buffer.size() == 0) {
1263             url->flags |= URL_FLAGS_FAILED;
1264             return;
1265           }
1266           if (has_state_override &&
1267               buffer.size() == 0 &&
1268               ((url->username.size() > 0 || url->password.size() > 0) ||
1269                url->port != -1)) {
1270             url->flags |= URL_FLAGS_TERMINATED;
1271             return;
1272           }
1273           url->flags |= URL_FLAGS_HAS_HOST;
1274           if (!ParseHost(buffer, &url->host, special)) {
1275             url->flags |= URL_FLAGS_FAILED;
1276             return;
1277           }
1278           buffer.clear();
1279           state = kPathStart;
1280           if (has_state_override) {
1281             return;
1282           }
1283         } else {
1284           if (ch == '[')
1285             square_bracket_flag = true;
1286           if (ch == ']')
1287             square_bracket_flag = false;
1288           buffer += ch;
1289         }
1290         break;
1291       case kPort:
1292         if (IsASCIIDigit(ch)) {
1293           buffer += ch;
1294         } else if (has_state_override ||
1295                    ch == kEOL ||
1296                    ch == '/' ||
1297                    ch == '?' ||
1298                    ch == '#' ||
1299                    special_back_slash) {
1300           if (buffer.size() > 0) {
1301             unsigned port = 0;
1302             // the condition port <= 0xffff prevents integer overflow
1303             for (size_t i = 0; port <= 0xffff && i < buffer.size(); i++)
1304               port = port * 10 + buffer[i] - '0';
1305             if (port > 0xffff) {
1306               // TODO(TimothyGu): This hack is currently needed for the host
1307               // setter since it needs access to hostname if it is valid, and
1308               // if the FAILED flag is set the entire response to JS layer
1309               // will be empty.
1310               if (state_override == kHost)
1311                 url->port = -1;
1312               else
1313                 url->flags |= URL_FLAGS_FAILED;
1314               return;
1315             }
1316             // the port is valid
1317             url->port = NormalizePort(url->scheme, static_cast<int>(port));
1318             if (url->port == -1)
1319               url->flags |= URL_FLAGS_IS_DEFAULT_SCHEME_PORT;
1320             buffer.clear();
1321           } else if (has_state_override) {
1322             // TODO(TimothyGu): Similar case as above.
1323             if (state_override == kHost)
1324               url->port = -1;
1325             else
1326               url->flags |= URL_FLAGS_TERMINATED;
1327             return;
1328           }
1329           state = kPathStart;
1330           continue;
1331         } else {
1332           url->flags |= URL_FLAGS_FAILED;
1333           return;
1334         }
1335         break;
1336       case kFile:
1337         url->scheme = "file:";
1338         if (ch == '/' || ch == '\\') {
1339           state = kFileSlash;
1340         } else if (has_base && base->scheme == "file:") {
1341           switch (ch) {
1342             case kEOL:
1343               if (base->flags & URL_FLAGS_HAS_HOST) {
1344                 url->flags |= URL_FLAGS_HAS_HOST;
1345                 url->host = base->host;
1346               }
1347               if (base->flags & URL_FLAGS_HAS_PATH) {
1348                 url->flags |= URL_FLAGS_HAS_PATH;
1349                 url->path = base->path;
1350               }
1351               if (base->flags & URL_FLAGS_HAS_QUERY) {
1352                 url->flags |= URL_FLAGS_HAS_QUERY;
1353                 url->query = base->query;
1354               }
1355               break;
1356             case '?':
1357               if (base->flags & URL_FLAGS_HAS_HOST) {
1358                 url->flags |= URL_FLAGS_HAS_HOST;
1359                 url->host = base->host;
1360               }
1361               if (base->flags & URL_FLAGS_HAS_PATH) {
1362                 url->flags |= URL_FLAGS_HAS_PATH;
1363                 url->path = base->path;
1364               }
1365               url->flags |= URL_FLAGS_HAS_QUERY;
1366               url->query.clear();
1367               state = kQuery;
1368               break;
1369             case '#':
1370               if (base->flags & URL_FLAGS_HAS_HOST) {
1371                 url->flags |= URL_FLAGS_HAS_HOST;
1372                 url->host = base->host;
1373               }
1374               if (base->flags & URL_FLAGS_HAS_PATH) {
1375                 url->flags |= URL_FLAGS_HAS_PATH;
1376                 url->path = base->path;
1377               }
1378               if (base->flags & URL_FLAGS_HAS_QUERY) {
1379                 url->flags |= URL_FLAGS_HAS_QUERY;
1380                 url->query = base->query;
1381               }
1382               url->flags |= URL_FLAGS_HAS_FRAGMENT;
1383               url->fragment.clear();
1384               state = kFragment;
1385               break;
1386             default:
1387               if (!StartsWithWindowsDriveLetter(p, end)) {
1388                 if (base->flags & URL_FLAGS_HAS_HOST) {
1389                   url->flags |= URL_FLAGS_HAS_HOST;
1390                   url->host = base->host;
1391                 }
1392                 if (base->flags & URL_FLAGS_HAS_PATH) {
1393                   url->flags |= URL_FLAGS_HAS_PATH;
1394                   url->path = base->path;
1395                 }
1396                 ShortenUrlPath(url);
1397               }
1398               state = kPath;
1399               continue;
1400           }
1401         } else {
1402           state = kPath;
1403           continue;
1404         }
1405         break;
1406       case kFileSlash:
1407         if (ch == '/' || ch == '\\') {
1408           state = kFileHost;
1409         } else {
1410           if (has_base &&
1411               base->scheme == "file:" &&
1412               !StartsWithWindowsDriveLetter(p, end)) {
1413             if (IsNormalizedWindowsDriveLetter(base->path[0])) {
1414               url->flags |= URL_FLAGS_HAS_PATH;
1415               url->path.push_back(base->path[0]);
1416             } else {
1417               if (base->flags & URL_FLAGS_HAS_HOST) {
1418                 url->flags |= URL_FLAGS_HAS_HOST;
1419                 url->host = base->host;
1420               } else {
1421                 url->flags &= ~URL_FLAGS_HAS_HOST;
1422                 url->host.clear();
1423               }
1424             }
1425           }
1426           state = kPath;
1427           continue;
1428         }
1429         break;
1430       case kFileHost:
1431         if (ch == kEOL ||
1432             ch == '/' ||
1433             ch == '\\' ||
1434             ch == '?' ||
1435             ch == '#') {
1436           if (!has_state_override &&
1437               buffer.size() == 2 &&
1438               IsWindowsDriveLetter(buffer)) {
1439             state = kPath;
1440           } else if (buffer.size() == 0) {
1441             url->flags |= URL_FLAGS_HAS_HOST;
1442             url->host.clear();
1443             if (has_state_override)
1444               return;
1445             state = kPathStart;
1446           } else {
1447             std::string host;
1448             if (!ParseHost(buffer, &host, special)) {
1449               url->flags |= URL_FLAGS_FAILED;
1450               return;
1451             }
1452             if (host == "localhost")
1453               host.clear();
1454             url->flags |= URL_FLAGS_HAS_HOST;
1455             url->host = host;
1456             if (has_state_override)
1457               return;
1458             buffer.clear();
1459             state = kPathStart;
1460           }
1461           continue;
1462         } else {
1463           buffer += ch;
1464         }
1465         break;
1466       case kPathStart:
1467         if (IsSpecial(url->scheme)) {
1468           state = kPath;
1469           if (ch != '/' && ch != '\\') {
1470             continue;
1471           }
1472         } else if (!has_state_override && ch == '?') {
1473           url->flags |= URL_FLAGS_HAS_QUERY;
1474           url->query.clear();
1475           state = kQuery;
1476         } else if (!has_state_override && ch == '#') {
1477           url->flags |= URL_FLAGS_HAS_FRAGMENT;
1478           url->fragment.clear();
1479           state = kFragment;
1480         } else if (ch != kEOL) {
1481           state = kPath;
1482           if (ch != '/') {
1483             continue;
1484           }
1485         }
1486         break;
1487       case kPath:
1488         if (ch == kEOL ||
1489             ch == '/' ||
1490             special_back_slash ||
1491             (!has_state_override && (ch == '?' || ch == '#'))) {
1492           if (IsDoubleDotSegment(buffer)) {
1493             ShortenUrlPath(url);
1494             if (ch != '/' && !special_back_slash) {
1495               url->flags |= URL_FLAGS_HAS_PATH;
1496               url->path.emplace_back("");
1497             }
1498           } else if (IsSingleDotSegment(buffer) &&
1499                      ch != '/' && !special_back_slash) {
1500             url->flags |= URL_FLAGS_HAS_PATH;
1501             url->path.emplace_back("");
1502           } else if (!IsSingleDotSegment(buffer)) {
1503             if (url->scheme == "file:" &&
1504                 url->path.empty() &&
1505                 buffer.size() == 2 &&
1506                 IsWindowsDriveLetter(buffer)) {
1507               if ((url->flags & URL_FLAGS_HAS_HOST) &&
1508                   !url->host.empty()) {
1509                 url->host.clear();
1510                 url->flags |= URL_FLAGS_HAS_HOST;
1511               }
1512               buffer[1] = ':';
1513             }
1514             url->flags |= URL_FLAGS_HAS_PATH;
1515             url->path.emplace_back(std::move(buffer));
1516           }
1517           buffer.clear();
1518           if (url->scheme == "file:" &&
1519               (ch == kEOL ||
1520                ch == '?' ||
1521                ch == '#')) {
1522             while (url->path.size() > 1 && url->path[0].empty()) {
1523               url->path.erase(url->path.begin());
1524             }
1525           }
1526           if (ch == '?') {
1527             url->flags |= URL_FLAGS_HAS_QUERY;
1528             state = kQuery;
1529           } else if (ch == '#') {
1530             state = kFragment;
1531           }
1532         } else {
1533           AppendOrEscape(&buffer, ch, PATH_ENCODE_SET);
1534         }
1535         break;
1536       case kCannotBeBase:
1537         switch (ch) {
1538           case '?':
1539             state = kQuery;
1540             break;
1541           case '#':
1542             state = kFragment;
1543             break;
1544           default:
1545             if (url->path.empty())
1546               url->path.emplace_back("");
1547             else if (ch != kEOL)
1548               AppendOrEscape(&url->path[0], ch, C0_CONTROL_ENCODE_SET);
1549         }
1550         break;
1551       case kQuery:
1552         if (ch == kEOL || (!has_state_override && ch == '#')) {
1553           url->flags |= URL_FLAGS_HAS_QUERY;
1554           url->query = std::move(buffer);
1555           buffer.clear();
1556           if (ch == '#')
1557             state = kFragment;
1558         } else {
1559           AppendOrEscape(&buffer, ch, special ? QUERY_ENCODE_SET_SPECIAL :
1560                                                 QUERY_ENCODE_SET_NONSPECIAL);
1561         }
1562         break;
1563       case kFragment:
1564         switch (ch) {
1565           case kEOL:
1566             url->flags |= URL_FLAGS_HAS_FRAGMENT;
1567             url->fragment = std::move(buffer);
1568             break;
1569           case 0:
1570             break;
1571           default:
1572             AppendOrEscape(&buffer, ch, FRAGMENT_ENCODE_SET);
1573         }
1574         break;
1575       default:
1576         url->flags |= URL_FLAGS_INVALID_PARSE_STATE;
1577         return;
1578     }
1579 
1580     p++;
1581   }
1582 }  // NOLINT(readability/fn_size)
1583 
1584 namespace {
SetArgs(Environment * env,Local<Value> argv[ARG_COUNT],const struct url_data & url)1585 void SetArgs(Environment* env,
1586              Local<Value> argv[ARG_COUNT],
1587              const struct url_data& url) {
1588   Isolate* isolate = env->isolate();
1589   argv[ARG_FLAGS] = Integer::NewFromUnsigned(isolate, url.flags);
1590   argv[ARG_PROTOCOL] =
1591       url.flags & URL_FLAGS_SPECIAL ?
1592           GetSpecial(env, url.scheme) :
1593           OneByteString(isolate, url.scheme.c_str());
1594   if (url.flags & URL_FLAGS_HAS_USERNAME)
1595     argv[ARG_USERNAME] = Utf8String(isolate, url.username);
1596   if (url.flags & URL_FLAGS_HAS_PASSWORD)
1597     argv[ARG_PASSWORD] = Utf8String(isolate, url.password);
1598   if (url.flags & URL_FLAGS_HAS_HOST)
1599     argv[ARG_HOST] = Utf8String(isolate, url.host);
1600   if (url.flags & URL_FLAGS_HAS_QUERY)
1601     argv[ARG_QUERY] = Utf8String(isolate, url.query);
1602   if (url.flags & URL_FLAGS_HAS_FRAGMENT)
1603     argv[ARG_FRAGMENT] = Utf8String(isolate, url.fragment);
1604   if (url.port > -1)
1605     argv[ARG_PORT] = Integer::New(isolate, url.port);
1606   if (url.flags & URL_FLAGS_HAS_PATH)
1607     argv[ARG_PATH] = ToV8Value(env->context(), url.path).ToLocalChecked();
1608 }
1609 
Parse(Environment * env,Local<Value> recv,const char * input,size_t len,enum url_parse_state state_override,Local<Value> base_obj,Local<Value> context_obj,Local<Function> cb,Local<Value> error_cb)1610 void Parse(Environment* env,
1611            Local<Value> recv,
1612            const char* input,
1613            size_t len,
1614            enum url_parse_state state_override,
1615            Local<Value> base_obj,
1616            Local<Value> context_obj,
1617            Local<Function> cb,
1618            Local<Value> error_cb) {
1619   Isolate* isolate = env->isolate();
1620   Local<Context> context = env->context();
1621   HandleScope handle_scope(isolate);
1622   Context::Scope context_scope(context);
1623 
1624   const bool has_context = context_obj->IsObject();
1625   const bool has_base = base_obj->IsObject();
1626 
1627   url_data base;
1628   url_data url;
1629   if (has_context)
1630     url = HarvestContext(env, context_obj.As<Object>());
1631   if (has_base)
1632     base = HarvestBase(env, base_obj.As<Object>());
1633 
1634   URL::Parse(input, len, state_override, &url, has_context, &base, has_base);
1635   if ((url.flags & URL_FLAGS_INVALID_PARSE_STATE) ||
1636       ((state_override != kUnknownState) &&
1637        (url.flags & URL_FLAGS_TERMINATED)))
1638     return;
1639 
1640   // Define the return value placeholders
1641   const Local<Value> undef = Undefined(isolate);
1642   const Local<Value> null = Null(isolate);
1643   if (!(url.flags & URL_FLAGS_FAILED)) {
1644     Local<Value> argv[] = {
1645       undef,
1646       undef,
1647       undef,
1648       undef,
1649       null,  // host defaults to null
1650       null,  // port defaults to null
1651       undef,
1652       null,  // query defaults to null
1653       null,  // fragment defaults to null
1654     };
1655     SetArgs(env, argv, url);
1656     cb->Call(context, recv, arraysize(argv), argv).FromMaybe(Local<Value>());
1657   } else if (error_cb->IsFunction()) {
1658     Local<Value> argv[2] = { undef, undef };
1659     argv[ERR_ARG_FLAGS] = Integer::NewFromUnsigned(isolate, url.flags);
1660     argv[ERR_ARG_INPUT] =
1661       String::NewFromUtf8(env->isolate(), input).ToLocalChecked();
1662     error_cb.As<Function>()->Call(context, recv, arraysize(argv), argv)
1663         .FromMaybe(Local<Value>());
1664   }
1665 }
1666 
Parse(const FunctionCallbackInfo<Value> & args)1667 void Parse(const FunctionCallbackInfo<Value>& args) {
1668   Environment* env = Environment::GetCurrent(args);
1669   CHECK_GE(args.Length(), 5);
1670   CHECK(args[0]->IsString());  // input
1671   CHECK(args[2]->IsUndefined() ||  // base context
1672         args[2]->IsNull() ||
1673         args[2]->IsObject());
1674   CHECK(args[3]->IsUndefined() ||  // context
1675         args[3]->IsNull() ||
1676         args[3]->IsObject());
1677   CHECK(args[4]->IsFunction());  // complete callback
1678   CHECK(args[5]->IsUndefined() || args[5]->IsFunction());  // error callback
1679 
1680   Utf8Value input(env->isolate(), args[0]);
1681   enum url_parse_state state_override = kUnknownState;
1682   if (args[1]->IsNumber()) {
1683     state_override = static_cast<enum url_parse_state>(
1684         args[1]->Uint32Value(env->context()).FromJust());
1685   }
1686 
1687   Parse(env, args.This(),
1688         *input, input.length(),
1689         state_override,
1690         args[2],
1691         args[3],
1692         args[4].As<Function>(),
1693         args[5]);
1694 }
1695 
EncodeAuthSet(const FunctionCallbackInfo<Value> & args)1696 void EncodeAuthSet(const FunctionCallbackInfo<Value>& args) {
1697   Environment* env = Environment::GetCurrent(args);
1698   CHECK_GE(args.Length(), 1);
1699   CHECK(args[0]->IsString());
1700   Utf8Value value(env->isolate(), args[0]);
1701   std::string output;
1702   size_t len = value.length();
1703   output.reserve(len);
1704   for (size_t n = 0; n < len; n++) {
1705     const char ch = (*value)[n];
1706     AppendOrEscape(&output, ch, USERINFO_ENCODE_SET);
1707   }
1708   args.GetReturnValue().Set(
1709       String::NewFromUtf8(env->isolate(), output.c_str()).ToLocalChecked());
1710 }
1711 
ToUSVString(const FunctionCallbackInfo<Value> & args)1712 void ToUSVString(const FunctionCallbackInfo<Value>& args) {
1713   Environment* env = Environment::GetCurrent(args);
1714   CHECK_GE(args.Length(), 2);
1715   CHECK(args[0]->IsString());
1716   CHECK(args[1]->IsNumber());
1717 
1718   TwoByteValue value(env->isolate(), args[0]);
1719 
1720   int64_t start = args[1]->IntegerValue(env->context()).FromJust();
1721   CHECK_GE(start, 0);
1722 
1723   for (size_t i = start; i < value.length(); i++) {
1724     char16_t c = value[i];
1725     if (!IsUnicodeSurrogate(c)) {
1726       continue;
1727     } else if (IsUnicodeSurrogateTrail(c) || i == value.length() - 1) {
1728       value[i] = kUnicodeReplacementCharacter;
1729     } else {
1730       char16_t d = value[i + 1];
1731       if (IsUnicodeTrail(d)) {
1732         i++;
1733       } else {
1734         value[i] = kUnicodeReplacementCharacter;
1735       }
1736     }
1737   }
1738 
1739   args.GetReturnValue().Set(
1740       String::NewFromTwoByte(env->isolate(),
1741                              *value,
1742                              NewStringType::kNormal,
1743                              value.length()).ToLocalChecked());
1744 }
1745 
DomainToASCII(const FunctionCallbackInfo<Value> & args)1746 void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
1747   Environment* env = Environment::GetCurrent(args);
1748   CHECK_GE(args.Length(), 1);
1749   CHECK(args[0]->IsString());
1750   Utf8Value value(env->isolate(), args[0]);
1751 
1752   URLHost host;
1753   // Assuming the host is used for a special scheme.
1754   host.ParseHost(*value, value.length(), true);
1755   if (host.ParsingFailed()) {
1756     args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), ""));
1757     return;
1758   }
1759   std::string out = host.ToStringMove();
1760   args.GetReturnValue().Set(
1761       String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked());
1762 }
1763 
DomainToUnicode(const FunctionCallbackInfo<Value> & args)1764 void DomainToUnicode(const FunctionCallbackInfo<Value>& args) {
1765   Environment* env = Environment::GetCurrent(args);
1766   CHECK_GE(args.Length(), 1);
1767   CHECK(args[0]->IsString());
1768   Utf8Value value(env->isolate(), args[0]);
1769 
1770   URLHost host;
1771   // Assuming the host is used for a special scheme.
1772   host.ParseHost(*value, value.length(), true, true);
1773   if (host.ParsingFailed()) {
1774     args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), ""));
1775     return;
1776   }
1777   std::string out = host.ToStringMove();
1778   args.GetReturnValue().Set(
1779       String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked());
1780 }
1781 
SetURLConstructor(const FunctionCallbackInfo<Value> & args)1782 void SetURLConstructor(const FunctionCallbackInfo<Value>& args) {
1783   Environment* env = Environment::GetCurrent(args);
1784   CHECK_EQ(args.Length(), 1);
1785   CHECK(args[0]->IsFunction());
1786   env->set_url_constructor_function(args[0].As<Function>());
1787 }
1788 
Initialize(Local<Object> target,Local<Value> unused,Local<Context> context,void * priv)1789 void Initialize(Local<Object> target,
1790                 Local<Value> unused,
1791                 Local<Context> context,
1792                 void* priv) {
1793   Environment* env = Environment::GetCurrent(context);
1794   env->SetMethod(target, "parse", Parse);
1795   env->SetMethodNoSideEffect(target, "encodeAuth", EncodeAuthSet);
1796   env->SetMethodNoSideEffect(target, "toUSVString", ToUSVString);
1797   env->SetMethodNoSideEffect(target, "domainToASCII", DomainToASCII);
1798   env->SetMethodNoSideEffect(target, "domainToUnicode", DomainToUnicode);
1799   env->SetMethod(target, "setURLConstructor", SetURLConstructor);
1800 
1801 #define XX(name, _) NODE_DEFINE_CONSTANT(target, name);
1802   FLAGS(XX)
1803 #undef XX
1804 
1805 #define XX(name) NODE_DEFINE_CONSTANT(target, name);
1806   PARSESTATES(XX)
1807 #undef XX
1808 }
1809 }  // namespace
1810 
ToFilePath() const1811 std::string URL::ToFilePath() const {
1812   if (context_.scheme != "file:") {
1813     return "";
1814   }
1815 
1816 #ifdef _WIN32
1817   const char* slash = "\\";
1818   auto is_slash = [] (char ch) {
1819     return ch == '/' || ch == '\\';
1820   };
1821 #else
1822   const char* slash = "/";
1823   auto is_slash = [] (char ch) {
1824     return ch == '/';
1825   };
1826   if ((context_.flags & URL_FLAGS_HAS_HOST) &&
1827       context_.host.length() > 0) {
1828     return "";
1829   }
1830 #endif
1831   std::string decoded_path;
1832   for (const std::string& part : context_.path) {
1833     std::string decoded = PercentDecode(part.c_str(), part.length());
1834     for (char& ch : decoded) {
1835       if (is_slash(ch)) {
1836         return "";
1837       }
1838     }
1839     decoded_path += slash + decoded;
1840   }
1841 
1842 #ifdef _WIN32
1843   // TODO(TimothyGu): Use "\\?\" long paths on Windows.
1844 
1845   // If hostname is set, then we have a UNC path. Pass the hostname through
1846   // ToUnicode just in case it is an IDN using punycode encoding. We do not
1847   // need to worry about percent encoding because the URL parser will have
1848   // already taken care of that for us. Note that this only causes IDNs with an
1849   // appropriate `xn--` prefix to be decoded.
1850   if ((context_.flags & URL_FLAGS_HAS_HOST) &&
1851       context_.host.length() > 0) {
1852     std::string unicode_host;
1853     if (!ToUnicode(context_.host, &unicode_host)) {
1854       return "";
1855     }
1856     return "\\\\" + unicode_host + decoded_path;
1857   }
1858   // Otherwise, it's a local path that requires a drive letter.
1859   if (decoded_path.length() < 3) {
1860     return "";
1861   }
1862   if (decoded_path[2] != ':' ||
1863       !IsASCIIAlpha(decoded_path[1])) {
1864     return "";
1865   }
1866   // Strip out the leading '\'.
1867   return decoded_path.substr(1);
1868 #else
1869   return decoded_path;
1870 #endif
1871 }
1872 
FromFilePath(const std::string & file_path)1873 URL URL::FromFilePath(const std::string& file_path) {
1874   URL url("file://");
1875   std::string escaped_file_path;
1876   for (size_t i = 0; i < file_path.length(); ++i) {
1877     escaped_file_path += file_path[i];
1878     if (file_path[i] == '%')
1879       escaped_file_path += "25";
1880   }
1881   URL::Parse(escaped_file_path.c_str(), escaped_file_path.length(), kPathStart,
1882              &url.context_, true, nullptr, false);
1883   return url;
1884 }
1885 
1886 // This function works by calling out to a JS function that creates and
1887 // returns the JS URL object. Be mindful of the JS<->Native boundary
1888 // crossing that is required.
ToObject(Environment * env) const1889 MaybeLocal<Value> URL::ToObject(Environment* env) const {
1890   Isolate* isolate = env->isolate();
1891   Local<Context> context = env->context();
1892   Context::Scope context_scope(context);
1893 
1894   const Local<Value> undef = Undefined(isolate);
1895   const Local<Value> null = Null(isolate);
1896 
1897   if (context_.flags & URL_FLAGS_FAILED)
1898     return Local<Value>();
1899 
1900   Local<Value> argv[] = {
1901     undef,
1902     undef,
1903     undef,
1904     undef,
1905     null,  // host defaults to null
1906     null,  // port defaults to null
1907     undef,
1908     null,  // query defaults to null
1909     null,  // fragment defaults to null
1910   };
1911   SetArgs(env, argv, context_);
1912 
1913   MaybeLocal<Value> ret;
1914   {
1915     TryCatchScope try_catch(env, TryCatchScope::CatchMode::kFatal);
1916 
1917     // The SetURLConstructor method must have been called already to
1918     // set the constructor function used below. SetURLConstructor is
1919     // called automatically when the internal/url.js module is loaded
1920     // during the internal/bootstrap/node.js processing.
1921     ret = env->url_constructor_function()
1922         ->Call(env->context(), undef, arraysize(argv), argv);
1923   }
1924 
1925   return ret;
1926 }
1927 
1928 }  // namespace url
1929 }  // namespace node
1930 
1931 NODE_MODULE_CONTEXT_AWARE_INTERNAL(url, node::url::Initialize)
1932