• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  * SPDX-License-Identifier: curl
22  *
23  ***************************************************************************/
24 
25 #include "curl_setup.h"
26 
27 #include "urldata.h"
28 #include "urlapi-int.h"
29 #include "strcase.h"
30 #include "url.h"
31 #include "escape.h"
32 #include "curl_ctype.h"
33 #include "inet_pton.h"
34 #include "inet_ntop.h"
35 #include "strdup.h"
36 #include "idn.h"
37 
38 /* The last 3 #include files should be in this order */
39 #include "curl_printf.h"
40 #include "curl_memory.h"
41 #include "memdebug.h"
42 
/* MS-DOS/Windows style drive prefix, eg c: in c:foo
 *
 * Every use of the argument is parenthesized so that an expression such as
 * 'ptr + 1' can be passed safely, the same way
 * STARTS_WITH_URL_DRIVE_PREFIX() does it. Note that the argument is
 * evaluated more than once, so it must be free of side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
48 
49   /* MS-DOS/Windows style drive prefix, optionally with
50    * a '|' instead of ':', followed by a slash or NUL */
51 #define STARTS_WITH_URL_DRIVE_PREFIX(str) \
52   ((('a' <= (str)[0] && (str)[0] <= 'z') || \
53     ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
54    ((str)[1] == ':' || (str)[1] == '|') && \
55    ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == 0))
56 
57 /* scheme is not URL encoded, the longest libcurl supported ones are... */
58 #define MAX_SCHEME_LEN 40
59 
60 /*
61  * If USE_IPV6 is disabled, we still want to parse IPv6 addresses, so make
62  * sure we have _some_ value for AF_INET6 without polluting our fake value
63  * everywhere.
64  */
65 #if !defined(USE_IPV6) && !defined(AF_INET6)
66 #define AF_INET6 (AF_INET + 1)
67 #endif
68 
69 /* Internal representation of CURLU. Point to URL-encoded strings. */
70 struct Curl_URL {
71   char *scheme;
72   char *user;
73   char *password;
74   char *options; /* IMAP only? */
75   char *host;
76   char *zoneid; /* for numerical IPv6 addresses */
77   char *port;
78   char *path;
79   char *query;
80   char *fragment;
81   unsigned short portnum; /* the numerical version (if 'port' is set) */
82   BIT(query_present);    /* to support blank */
83   BIT(fragment_present); /* to support blank */
84   BIT(guessed_scheme);   /* when a URL without scheme is parsed */
85 };
86 
87 #define DEFAULT_SCHEME "https"
88 
89 static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
90                                       unsigned int flags);
91 
free_urlhandle(struct Curl_URL * u)92 static void free_urlhandle(struct Curl_URL *u)
93 {
94   free(u->scheme);
95   free(u->user);
96   free(u->password);
97   free(u->options);
98   free(u->host);
99   free(u->zoneid);
100   free(u->port);
101   free(u->path);
102   free(u->query);
103   free(u->fragment);
104 }
105 
106 /*
107  * Find the separator at the end of the hostname, or the '?' in cases like
108  * http://www.example.com?id=2380
109  */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *slash;
  const char *qmark;

  /* the hostname begins right after the first "//", or at the very start
     of the string when there is no such marker */
  const char *host = strstr(url, "//");
  host = host ? host + 2 : url;

  /* a separator that is missing counts as sitting at the end of the
     string */
  slash = strchr(host, '/');
  if(!slash)
    slash = end;

  qmark = strchr(host, '?');
  if(!qmark)
    qmark = end;

  /* whichever of the two comes first ends the hostname */
  return (qmark <= slash) ? qmark : slash;
}
133 
134 /* convert CURLcode to CURLUcode */
135 #define cc2cu(x) ((x) == CURLE_TOO_LARGE ? CURLUE_TOO_LARGE :   \
136                   CURLUE_OUT_OF_MEMORY)
137 /*
138  * Decide whether a character in a URL must be escaped.
139  */
140 #define urlchar_needs_escaping(c) (!(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c)))
141 
142 static const char hexdigits[] = "0123456789abcdef";
143 /* urlencode_str() writes data into an output dynbuf and URL-encodes the
144  * spaces in the source URL accordingly.
145  *
146  * URL encoding should be skipped for hostnames, otherwise IDN resolution
147  * will fail.
148  */
urlencode_str(struct dynbuf * o,const char * url,size_t len,bool relative,bool query)149 static CURLUcode urlencode_str(struct dynbuf *o, const char *url,
150                                size_t len, bool relative,
151                                bool query)
152 {
153   /* we must add this with whitespace-replacing */
154   bool left = !query;
155   const unsigned char *iptr;
156   const unsigned char *host_sep = (const unsigned char *) url;
157   CURLcode result;
158 
159   if(!relative)
160     host_sep = (const unsigned char *) find_host_sep(url);
161 
162   for(iptr = (unsigned char *)url;    /* read from here */
163       len; iptr++, len--) {
164 
165     if(iptr < host_sep) {
166       result = Curl_dyn_addn(o, iptr, 1);
167       if(result)
168         return cc2cu(result);
169       continue;
170     }
171 
172     if(*iptr == ' ') {
173       if(left)
174         result = Curl_dyn_addn(o, "%20", 3);
175       else
176         result = Curl_dyn_addn(o, "+", 1);
177       if(result)
178         return cc2cu(result);
179       continue;
180     }
181 
182     if(*iptr == '?')
183       left = FALSE;
184 
185     if(urlchar_needs_escaping(*iptr)) {
186       char out[3]={'%'};
187       out[1] = hexdigits[*iptr >> 4];
188       out[2] = hexdigits[*iptr & 0xf];
189       result = Curl_dyn_addn(o, out, 3);
190     }
191     else
192       result = Curl_dyn_addn(o, iptr, 1);
193     if(result)
194       return cc2cu(result);
195   }
196 
197   return CURLUE_OK;
198 }
199 
200 /*
201  * Returns the length of the scheme if the given URL is absolute (as opposed
202  * to relative). Stores the scheme in the buffer if TRUE and 'buf' is
203  * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set.
204  *
205  * If 'guess_scheme' is TRUE, it means the URL might be provided without
206  * scheme.
207  */
size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen,
                            bool guess_scheme)
{
  size_t n = 0; /* number of leading bytes that form a scheme name */

  DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN));
  (void)buflen; /* only used in debug-builds */

  /* always leave a defined value in buf */
  if(buf)
    buf[0] = 0;

#ifdef _WIN32
  if(guess_scheme && STARTS_WITH_DRIVE_PREFIX(url))
    return 0;
#endif

  /* RFC 3986 3.1 explains:
       scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  */
  if(ISALPHA(url[0])) {
    n = 1;
    while(n < MAX_SCHEME_LEN) {
      char s = url[n];
      if(!s || !(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.')))
        break;
      n++;
    }
  }

  /* a scheme must be followed by a colon */
  if(!n || (url[n] != ':'))
    return 0;

  /* If this does not guess scheme, the scheme always ends with the colon so
     that this also detects data: URLs etc. In guessing mode, data: could
     be the hostname "data" with a specified port number, so then a slash
     is required after the colon as well. */
  if(guess_scheme && (url[n + 1] != '/'))
    return 0;

  if(buf) {
    /* store the lowercased name, without the colon */
    Curl_strntolower(buf, url, n);
    buf[n] = 0;
  }

  /* the length of the scheme is the name part only */
  return n;
}
247 
248 /*
249  * Concatenate a relative URL to a base URL making it absolute.
250  *
251  * Note that this function destroys the 'base' string.
252  */
redirect_url(char * base,const char * relurl,CURLU * u,unsigned int flags)253 static CURLUcode redirect_url(char *base, const char *relurl,
254                               CURLU *u, unsigned int flags)
255 {
256   struct dynbuf urlbuf;
257   bool host_changed = FALSE;
258   const char *useurl = relurl;
259   CURLcode result = CURLE_OK;
260   CURLUcode uc;
261   /* protsep points to the start of the hostname */
262   char *protsep = strstr(base, "//");
263   DEBUGASSERT(protsep);
264   if(!protsep)
265     protsep = base;
266   else
267     protsep += 2; /* pass the slashes */
268 
269   if(('/' != relurl[0]) && ('#' != relurl[0])) {
270     /* First we need to find out if there is a ?-letter in the original URL,
271        and cut it and the right-side of that off */
272     char *pathsep = strchr(protsep, '?');
273     if(pathsep)
274       *pathsep = 0;
275     else {
276       /* if not, cut off the potential fragment */
277       pathsep = strchr(protsep, '#');
278       if(pathsep)
279         *pathsep = 0;
280     }
281 
282     /* if the redirect-to piece is not just a query, cut the path after the
283        last slash */
284     if(useurl[0] != '?') {
285       pathsep = strrchr(protsep, '/');
286       if(pathsep)
287         pathsep[1] = 0; /* leave the slash */
288     }
289   }
290   else if('/' == relurl[0]) {
291     /* We got a new absolute path for this server */
292 
293     if(relurl[1] == '/') {
294       /* the new URL starts with //, just keep the protocol part from the
295          original one */
296       *protsep = 0;
297       useurl = &relurl[2]; /* we keep the slashes from the original, so we
298                               skip the new ones */
299       host_changed = TRUE;
300     }
301     else {
302       /* cut the original URL at first slash */
303       char *pathsep = strchr(protsep, '/');
304       if(pathsep)
305         *pathsep = 0;
306     }
307   }
308   else {
309     /* the relative piece starts with '#' */
310 
311     /* If there is a fragment in the original URL, cut it off */
312     char *pathsep = strchr(protsep, '#');
313     if(pathsep)
314       *pathsep = 0;
315   }
316 
317   Curl_dyn_init(&urlbuf, CURL_MAX_INPUT_LENGTH);
318 
319   /* copy over the root URL part */
320   result = Curl_dyn_add(&urlbuf, base);
321   if(result)
322     return cc2cu(result);
323 
324   /* then append the new piece on the right side */
325   uc = urlencode_str(&urlbuf, useurl, strlen(useurl), !host_changed,
326                      FALSE);
327   if(!uc)
328     uc = parseurl_and_replace(Curl_dyn_ptr(&urlbuf), u,
329                               flags&~CURLU_PATH_AS_IS);
330   Curl_dyn_free(&urlbuf);
331   return uc;
332 }
333 
334 /* scan for byte values <= 31, 127 and sometimes space */
junkscan(const char * url,size_t * urllen,unsigned int flags)335 static CURLUcode junkscan(const char *url, size_t *urllen, unsigned int flags)
336 {
337   static const char badbytes[]={
338     /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
339     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
340     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
341     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
342     0x7f, 0x00 /* null-terminate */
343   };
344   size_t n = strlen(url);
345   size_t nfine;
346 
347   if(n > CURL_MAX_INPUT_LENGTH)
348     /* excessive input length */
349     return CURLUE_MALFORMED_INPUT;
350 
351   nfine = strcspn(url, badbytes);
352   if((nfine != n) ||
353      (!(flags & CURLU_ALLOW_SPACE) && strchr(url, ' ')))
354     return CURLUE_MALFORMED_INPUT;
355 
356   *urllen = n;
357   return CURLUE_OK;
358 }
359 
360 /*
361  * parse_hostname_login()
362  *
363  * Parse the login details (username, password and options) from the URL and
364  * strip them out of the hostname
365  *
366  */
parse_hostname_login(struct Curl_URL * u,const char * login,size_t len,unsigned int flags,size_t * offset)367 static CURLUcode parse_hostname_login(struct Curl_URL *u,
368                                       const char *login,
369                                       size_t len,
370                                       unsigned int flags,
371                                       size_t *offset) /* to the hostname */
372 {
373   CURLUcode result = CURLUE_OK;
374   CURLcode ccode;
375   char *userp = NULL;
376   char *passwdp = NULL;
377   char *optionsp = NULL;
378   const struct Curl_handler *h = NULL;
379 
380   /* At this point, we assume all the other special cases have been taken
381    * care of, so the host is at most
382    *
383    *   [user[:password][;options]]@]hostname
384    *
385    * We need somewhere to put the embedded details, so do that first.
386    */
387   char *ptr;
388 
389   DEBUGASSERT(login);
390 
391   *offset = 0;
392   ptr = memchr(login, '@', len);
393   if(!ptr)
394     goto out;
395 
396   /* We will now try to extract the
397    * possible login information in a string like:
398    * ftp://user:password@ftp.my.site:8021/README */
399   ptr++;
400 
401   /* if this is a known scheme, get some details */
402   if(u->scheme)
403     h = Curl_get_scheme_handler(u->scheme);
404 
405   /* We could use the login information in the URL so extract it. Only parse
406      options if the handler says we should. Note that 'h' might be NULL! */
407   ccode = Curl_parse_login_details(login, ptr - login - 1,
408                                    &userp, &passwdp,
409                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
410                                    &optionsp : NULL);
411   if(ccode) {
412     result = CURLUE_BAD_LOGIN;
413     goto out;
414   }
415 
416   if(userp) {
417     if(flags & CURLU_DISALLOW_USER) {
418       /* Option DISALLOW_USER is set and URL contains username. */
419       result = CURLUE_USER_NOT_ALLOWED;
420       goto out;
421     }
422     free(u->user);
423     u->user = userp;
424   }
425 
426   if(passwdp) {
427     free(u->password);
428     u->password = passwdp;
429   }
430 
431   if(optionsp) {
432     free(u->options);
433     u->options = optionsp;
434   }
435 
436   /* the hostname starts at this offset */
437   *offset = ptr - login;
438   return CURLUE_OK;
439 
440 out:
441 
442   free(userp);
443   free(passwdp);
444   free(optionsp);
445   u->user = NULL;
446   u->password = NULL;
447   u->options = NULL;
448 
449   return result;
450 }
451 
Curl_parse_port(struct Curl_URL * u,struct dynbuf * host,bool has_scheme)452 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
453                                    bool has_scheme)
454 {
455   char *portptr;
456   char *hostname = Curl_dyn_ptr(host);
457   /*
458    * Find the end of an IPv6 address on the ']' ending bracket.
459    */
460   if(hostname[0] == '[') {
461     portptr = strchr(hostname, ']');
462     if(!portptr)
463       return CURLUE_BAD_IPV6;
464     portptr++;
465     /* this is a RFC2732-style specified IP-address */
466     if(*portptr) {
467       if(*portptr != ':')
468         return CURLUE_BAD_PORT_NUMBER;
469     }
470     else
471       portptr = NULL;
472   }
473   else
474     portptr = strchr(hostname, ':');
475 
476   if(portptr) {
477     char *rest = NULL;
478     unsigned long port;
479     size_t keep = portptr - hostname;
480 
481     /* Browser behavior adaptation. If there is a colon with no digits after,
482        just cut off the name there which makes us ignore the colon and just
483        use the default port. Firefox, Chrome and Safari all do that.
484 
485        Do not do it if the URL has no scheme, to make something that looks like
486        a scheme not work!
487     */
488     Curl_dyn_setlen(host, keep);
489     portptr++;
490     if(!*portptr)
491       return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER;
492 
493     if(!ISDIGIT(*portptr))
494       return CURLUE_BAD_PORT_NUMBER;
495 
496     errno = 0;
497     port = strtoul(portptr, &rest, 10);  /* Port number must be decimal */
498 
499     if(errno || (port > 0xffff) || *rest)
500       return CURLUE_BAD_PORT_NUMBER;
501 
502     u->portnum = (unsigned short) port;
503     /* generate a new port number string to get rid of leading zeroes etc */
504     free(u->port);
505     u->port = aprintf("%ld", port);
506     if(!u->port)
507       return CURLUE_OUT_OF_MEMORY;
508   }
509 
510   return CURLUE_OK;
511 }
512 
513 /* this assumes 'hostname' now starts with [ */
ipv6_parse(struct Curl_URL * u,char * hostname,size_t hlen)514 static CURLUcode ipv6_parse(struct Curl_URL *u, char *hostname,
515                             size_t hlen) /* length of hostname */
516 {
517   size_t len;
518   DEBUGASSERT(*hostname == '[');
519   if(hlen < 4) /* '[::]' is the shortest possible valid string */
520     return CURLUE_BAD_IPV6;
521   hostname++;
522   hlen -= 2;
523 
524   /* only valid IPv6 letters are ok */
525   len = strspn(hostname, "0123456789abcdefABCDEF:.");
526 
527   if(hlen != len) {
528     hlen = len;
529     if(hostname[len] == '%') {
530       /* this could now be '%[zone id]' */
531       char zoneid[16];
532       int i = 0;
533       char *h = &hostname[len + 1];
534       /* pass '25' if present and is a URL encoded percent sign */
535       if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
536         h += 2;
537       while(*h && (*h != ']') && (i < 15))
538         zoneid[i++] = *h++;
539       if(!i || (']' != *h))
540         return CURLUE_BAD_IPV6;
541       zoneid[i] = 0;
542       u->zoneid = strdup(zoneid);
543       if(!u->zoneid)
544         return CURLUE_OUT_OF_MEMORY;
545       hostname[len] = ']'; /* insert end bracket */
546       hostname[len + 1] = 0; /* terminate the hostname */
547     }
548     else
549       return CURLUE_BAD_IPV6;
550     /* hostname is fine */
551   }
552 
553   /* Normalize the IPv6 address */
554   {
555     char dest[16]; /* fits a binary IPv6 address */
556     hostname[hlen] = 0; /* end the address there */
557     if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
558       return CURLUE_BAD_IPV6;
559     if(Curl_inet_ntop(AF_INET6, dest, hostname, hlen)) {
560       hlen = strlen(hostname); /* might be shorter now */
561       hostname[hlen + 1] = 0;
562     }
563     hostname[hlen] = ']'; /* restore ending bracket */
564   }
565   return CURLUE_OK;
566 }
567 
hostname_check(struct Curl_URL * u,char * hostname,size_t hlen)568 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname,
569                                 size_t hlen) /* length of hostname */
570 {
571   size_t len;
572   DEBUGASSERT(hostname);
573 
574   if(!hlen)
575     return CURLUE_NO_HOST;
576   else if(hostname[0] == '[')
577     return ipv6_parse(u, hostname, hlen);
578   else {
579     /* letters from the second string are not ok */
580     len = strcspn(hostname, " \r\n\t/:#?!@{}[]\\$\'\"^`*<>=;,+&()%");
581     if(hlen != len)
582       /* hostname with bad content */
583       return CURLUE_BAD_HOSTNAME;
584   }
585   return CURLUE_OK;
586 }
587 
588 /*
589  * Handle partial IPv4 numerical addresses and different bases, like
590  * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc.
591  *
592  * If the given input string is syntactically wrong IPv4 or any part for
593  * example is too big, this function returns HOST_NAME.
594  *
595  * Output the "normalized" version of that input string in plain quad decimal
596  * integers.
597  *
598  * Returns the host type.
599  */
600 
601 #define HOST_ERROR   -1 /* out of memory */
602 
603 #define HOST_NAME    1
604 #define HOST_IPV4    2
605 #define HOST_IPV6    3
606 
ipv4_normalize(struct dynbuf * host)607 static int ipv4_normalize(struct dynbuf *host)
608 {
609   bool done = FALSE;
610   int n = 0;
611   const char *c = Curl_dyn_ptr(host);
612   unsigned long parts[4] = {0, 0, 0, 0};
613   CURLcode result = CURLE_OK;
614 
615   if(*c == '[')
616     return HOST_IPV6;
617 
618   errno = 0; /* for strtoul */
619   while(!done) {
620     char *endp = NULL;
621     unsigned long l;
622     if(!ISDIGIT(*c))
623       /* most importantly this does not allow a leading plus or minus */
624       return HOST_NAME;
625     l = strtoul(c, &endp, 0);
626     if(errno)
627       return HOST_NAME;
628 #if SIZEOF_LONG > 4
629     /* a value larger than 32 bits */
630     if(l > UINT_MAX)
631       return HOST_NAME;
632 #endif
633 
634     parts[n] = l;
635     c = endp;
636 
637     switch(*c) {
638     case '.':
639       if(n == 3)
640         return HOST_NAME;
641       n++;
642       c++;
643       break;
644 
645     case '\0':
646       done = TRUE;
647       break;
648 
649     default:
650       return HOST_NAME;
651     }
652   }
653 
654   switch(n) {
655   case 0: /* a -- 32 bits */
656     Curl_dyn_reset(host);
657 
658     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
659                            (unsigned int)(parts[0] >> 24),
660                            (unsigned int)((parts[0] >> 16) & 0xff),
661                            (unsigned int)((parts[0] >> 8) & 0xff),
662                            (unsigned int)(parts[0] & 0xff));
663     break;
664   case 1: /* a.b -- 8.24 bits */
665     if((parts[0] > 0xff) || (parts[1] > 0xffffff))
666       return HOST_NAME;
667     Curl_dyn_reset(host);
668     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
669                            (unsigned int)(parts[0]),
670                            (unsigned int)((parts[1] >> 16) & 0xff),
671                            (unsigned int)((parts[1] >> 8) & 0xff),
672                            (unsigned int)(parts[1] & 0xff));
673     break;
674   case 2: /* a.b.c -- 8.8.16 bits */
675     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xffff))
676       return HOST_NAME;
677     Curl_dyn_reset(host);
678     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
679                            (unsigned int)(parts[0]),
680                            (unsigned int)(parts[1]),
681                            (unsigned int)((parts[2] >> 8) & 0xff),
682                            (unsigned int)(parts[2] & 0xff));
683     break;
684   case 3: /* a.b.c.d -- 8.8.8.8 bits */
685     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff) ||
686        (parts[3] > 0xff))
687       return HOST_NAME;
688     Curl_dyn_reset(host);
689     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
690                            (unsigned int)(parts[0]),
691                            (unsigned int)(parts[1]),
692                            (unsigned int)(parts[2]),
693                            (unsigned int)(parts[3]));
694     break;
695   }
696   if(result)
697     return HOST_ERROR;
698   return HOST_IPV4;
699 }
700 
701 /* if necessary, replace the host content with a URL decoded version */
urldecode_host(struct dynbuf * host)702 static CURLUcode urldecode_host(struct dynbuf *host)
703 {
704   char *per = NULL;
705   const char *hostname = Curl_dyn_ptr(host);
706   per = strchr(hostname, '%');
707   if(!per)
708     /* nothing to decode */
709     return CURLUE_OK;
710   else {
711     /* encoded */
712     size_t dlen;
713     char *decoded;
714     CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
715                                      REJECT_CTRL);
716     if(result)
717       return CURLUE_BAD_HOSTNAME;
718     Curl_dyn_reset(host);
719     result = Curl_dyn_addn(host, decoded, dlen);
720     free(decoded);
721     if(result)
722       return cc2cu(result);
723   }
724 
725   return CURLUE_OK;
726 }
727 
parse_authority(struct Curl_URL * u,const char * auth,size_t authlen,unsigned int flags,struct dynbuf * host,bool has_scheme)728 static CURLUcode parse_authority(struct Curl_URL *u,
729                                  const char *auth, size_t authlen,
730                                  unsigned int flags,
731                                  struct dynbuf *host,
732                                  bool has_scheme)
733 {
734   size_t offset;
735   CURLUcode uc;
736   CURLcode result;
737 
738   /*
739    * Parse the login details and strip them out of the hostname.
740    */
741   uc = parse_hostname_login(u, auth, authlen, flags, &offset);
742   if(uc)
743     goto out;
744 
745   result = Curl_dyn_addn(host, auth + offset, authlen - offset);
746   if(result) {
747     uc = cc2cu(result);
748     goto out;
749   }
750 
751   uc = Curl_parse_port(u, host, has_scheme);
752   if(uc)
753     goto out;
754 
755   if(!Curl_dyn_len(host))
756     return CURLUE_NO_HOST;
757 
758   switch(ipv4_normalize(host)) {
759   case HOST_IPV4:
760     break;
761   case HOST_IPV6:
762     uc = ipv6_parse(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
763     break;
764   case HOST_NAME:
765     uc = urldecode_host(host);
766     if(!uc)
767       uc = hostname_check(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
768     break;
769   case HOST_ERROR:
770     uc = CURLUE_OUT_OF_MEMORY;
771     break;
772   default:
773     uc = CURLUE_BAD_HOSTNAME; /* Bad IPv4 address even */
774     break;
775   }
776 
777 out:
778   return uc;
779 }
780 
781 /* used for HTTP/2 server push */
Curl_url_set_authority(CURLU * u,const char * authority)782 CURLUcode Curl_url_set_authority(CURLU *u, const char *authority)
783 {
784   CURLUcode result;
785   struct dynbuf host;
786 
787   DEBUGASSERT(authority);
788   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
789 
790   result = parse_authority(u, authority, strlen(authority),
791                            CURLU_DISALLOW_USER, &host, !!u->scheme);
792   if(result)
793     Curl_dyn_free(&host);
794   else {
795     free(u->host);
796     u->host = Curl_dyn_ptr(&host);
797   }
798   return result;
799 }
800 
801 /*
802  * "Remove Dot Segments"
803  * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
804  */
805 
/*
 * dedotdotify()
 * @unittest: 1395
 *
 * This function gets a null-terminated path with dot and dotdot sequences
 * passed in and strips them off according to the rules in RFC 3986 section
 * 5.2.4.
 *
 * The function handles a query part ('?' + stuff) appended but it expects
 * that fragments ('#' + stuff) have already been cut off.
 *
 * RETURNS
 *
 * Zero for success, with '*outp' set to an allocated dedotdotified string,
 * or to NULL when the input is shorter than two bytes or has no dot in it,
 * in which case the input can be used as it is. Non-zero when out of memory.
 */
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp);
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp)
{
  const char *stop = &input[clen]; /* first byte after the path */
  char *buf;                       /* start of the output */
  char *wr;                        /* where the next output byte goes */

  *outp = NULL;
  /* the path always starts with a slash, and a slash has no dot, so such
     input needs no work and *outp remains NULL */
  if((clen < 2) || !memchr(input, '.', clen))
    return 0;

  /* the output never gets longer than the input */
  buf = malloc(clen + 1);
  if(!buf)
    return 1; /* out of memory */

  buf[0] = 0; /* null-terminates, for inputs like "./" */
  wr = buf;

  do {
    /* set when the input starts with an ordinary segment that should be
       moved to the output */
    bool copyseg = FALSE;

    if(*input == '.') {
      /*  A. If the input buffer begins with a prefix of "../" or "./", then
          remove that prefix from the input buffer; otherwise, */
      if(!strncmp("./", input, 2))
        input += 2;
      else if(!strncmp("../", input, 3))
        input += 3;

      /*  D. if the input buffer consists only of "." or "..", then remove
          that from the input buffer; otherwise, */
      else if(!strcmp(".", input) || !strcmp("..", input) ||
              !strncmp(".?", input, 2) || !strncmp("..?", input, 3)) {
        *buf = 0;
        break;
      }
      else
        copyseg = TRUE;
    }
    else if(*input == '/') {
      /*  B. if the input buffer begins with a prefix of "/./" or "/.", where
          "."  is a complete path segment, then replace that prefix with "/" in
          the input buffer; otherwise, */
      if(!strncmp("/./", input, 3))
        input += 2;
      else if(!strcmp("/.", input) || !strncmp("/.?", input, 3)) {
        *wr++ = '/';
        *wr = 0;
        break;
      }

      /*  C. if the input buffer begins with a prefix of "/../" or "/..",
          where ".." is a complete path segment, then replace that prefix with
          "/" in the input buffer and remove the last segment and its
          preceding "/" (if any) from the output buffer; otherwise, */
      else if(!strncmp("/../", input, 4)) {
        input += 3;
        /* back up to the slash that starts the last output segment, or to
           the start of the output */
        while((wr > buf) && (*--wr != '/'))
          ;
        *wr = 0; /* null-terminate where it stops */
      }
      else if(!strcmp("/..", input) || !strncmp("/..?", input, 4)) {
        /* same removal of the last output segment, but this ends the path
           so a final slash is output */
        while((wr > buf) && (*--wr != '/'))
          ;
        *wr++ = '/';
        *wr = 0; /* null-terminate where it stops */
        break;
      }
      else
        copyseg = TRUE;
    }
    else
      copyseg = TRUE;

    if(copyseg) {
      /*  E. move the first path segment in the input buffer to the end of
          the output buffer, including the initial "/" character (if any) and
          any subsequent characters up to, but not including, the next "/"
          character or the end of the input buffer. A '?' ends a segment as
          well. */
      size_t seglen = 1 + strcspn(input + 1, "/?");
      memcpy(wr, input, seglen);
      wr += seglen;
      input += seglen;
      *wr = 0;
    }

    /* continue until end of path */
  } while(input < stop);

  *outp = buf;
  return 0; /* success */
}
931 
/*
 * parseurl() splits the NUL-terminated string 'url' into its components and
 * stores them in 'u', steered by the CURLU_* bits in 'flags'.
 *
 * The input is first vetted by junkscan(), which also provides its length.
 * "file:" URLs are handled separately; for all other URLs the authority part
 * is handed to parse_authority(). After that, fragment, query and path are
 * cut off (in that order) and stored as individually allocated strings.
 *
 * Returns CURLUE_OK on success. On failure the local host buffer is freed and
 * free_urlhandle() is called on 'u' before the error code is returned.
 */
static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
{
  const char *path;
  size_t pathlen;
  char *query = NULL;
  char *fragment = NULL;
  char schemebuf[MAX_SCHEME_LEN + 1];
  size_t schemelen = 0;
  size_t urllen;
  CURLUcode result = CURLUE_OK;
  size_t fraglen = 0;
  struct dynbuf host;

  DEBUGASSERT(url);

  Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);

  /* vet the input; this also fills in urllen */
  result = junkscan(url, &urllen, flags);
  if(result)
    goto fail;

  /* a non-zero schemelen means a scheme was found and put in schemebuf */
  schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
                                   flags & (CURLU_GUESS_SCHEME|
                                            CURLU_DEFAULT_SCHEME));

  /* handle the file: scheme */
  if(schemelen && !strcmp(schemebuf, "file")) {
    bool uncpath = FALSE;
    if(urllen <= 6) {
      /* file:/ is not enough to actually be a complete file: URL */
      result = CURLUE_BAD_FILE_URL;
      goto fail;
    }

    /* skip the "file:" prefix; 'path' points into 'url', nothing is copied */
    path = (char *)&url[5];
    pathlen = urllen - 5;

    u->scheme = strdup("file");
    if(!u->scheme) {
      result = CURLUE_OUT_OF_MEMORY;
      goto fail;
    }

    /* Extra handling URLs with an authority component (i.e. that start with
     * "file://")
     *
     * We allow omitted hostname (e.g. file:/<path>) -- valid according to
     * RFC 8089, but not the (current) WHAT-WG URL spec.
     */
    if(path[0] == '/' && path[1] == '/') {
      /* swallow the two slashes */
      const char *ptr = &path[2];

      /*
       * According to RFC 8089, a file: URL can be reliably dereferenced if:
       *
       *  o it has no/blank hostname, or
       *
       *  o the hostname matches "localhost" (case-insensitively), or
       *
       *  o the hostname is a FQDN that resolves to this machine, or
       *
       *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
       *    Appendix E.3).
       *
       * For brevity, we only consider URLs with empty, "localhost", or
       * "127.0.0.1" hostnames as local, otherwise as an UNC String.
       *
       * Additionally, there is an exception for URLs with a Windows drive
       * letter in the authority (which was accidentally omitted from RFC 8089
       * Appendix E, but believe me, it was meant to be there. --MK)
       */
      if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
        /* the URL includes a hostname, it must match "localhost" or
           "127.0.0.1" to be valid */
        if(checkprefix("localhost/", ptr) ||
           checkprefix("127.0.0.1/", ptr)) {
          /* both accepted names are nine characters long */
          ptr += 9; /* now points to the slash after the host */
        }
        else {
#if defined(_WIN32)
          size_t len;

          /* the hostname, NetBIOS computer name, can not contain disallowed
             chars, and the delimiting slash character must be appended to the
             hostname */
          path = strpbrk(ptr, "/\\:*?\"<>|");
          if(!path || *path != '/') {
            result = CURLUE_BAD_FILE_URL;
            goto fail;
          }

          len = path - ptr;
          if(len) {
            CURLcode code = Curl_dyn_addn(&host, ptr, len);
            if(code) {
              result = cc2cu(code);
              goto fail;
            }
            uncpath = TRUE;
          }

          ptr -= 2; /* now points to the // before the host in UNC */
#else
          /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
             none */
          result = CURLUE_BAD_FILE_URL;
          goto fail;
#endif
        }
      }

      /* the path is whatever remains of the input from 'ptr' on */
      path = ptr;
      pathlen = urllen - (ptr - url);
    }

    if(!uncpath)
      /* no host for file: URLs by default */
      Curl_dyn_reset(&host);

#if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
    /* Do not allow Windows drive letters when not in Windows.
     * This catches both "file:/c:" and "file:c:" */
    if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
       STARTS_WITH_URL_DRIVE_PREFIX(path)) {
      /* File drive letters are only accepted in MS-DOS/Windows */
      result = CURLUE_BAD_FILE_URL;
      goto fail;
    }
#else
    /* If the path starts with a slash and a drive letter, ditch the slash */
    if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
      /* nothing is moved in memory: just step past the slash */
      path++;
      pathlen--;
    }
#endif

  }
  else {
    /* any URL that is not a file: URL */
    const char *schemep = NULL;
    const char *hostp;
    size_t hostlen;

    if(schemelen) {
      int i = 0;
      /* step past the scheme and the colon */
      const char *p = &url[schemelen + 1];
      /* count the slashes; stopping at four is enough to detect "too many" */
      while((*p == '/') && (i < 4)) {
        p++;
        i++;
      }

      schemep = schemebuf;
      if(!Curl_get_scheme_handler(schemep) &&
         !(flags & CURLU_NON_SUPPORT_SCHEME)) {
        result = CURLUE_UNSUPPORTED_SCHEME;
        goto fail;
      }

      if((i < 1) || (i > 3)) {
        /* less than one or more than three slashes */
        result = CURLUE_BAD_SLASHES;
        goto fail;
      }
      hostp = p; /* hostname starts here */
    }
    else {
      /* no scheme! */

      if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
        result = CURLUE_BAD_SCHEME;
        goto fail;
      }
      if(flags & CURLU_DEFAULT_SCHEME)
        schemep = DEFAULT_SCHEME;

      /*
       * No scheme was found in the URL, so treat the start of the string as
       * the start of the hostname.
       */
      hostp = url;
    }

    if(schemep) {
      u->scheme = strdup(schemep);
      if(!u->scheme) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }

    /* find the end of the hostname + port number */
    hostlen = strcspn(hostp, "/?#");
    path = &hostp[hostlen];

    /* this pathlen also contains the query and the fragment */
    pathlen = urllen - (path - url);
    if(hostlen) {

      result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
      if(result)
        goto fail;

      /* schemep is still NULL here only when no scheme was given and no
         default scheme was requested */
      if((flags & CURLU_GUESS_SCHEME) && !schemep) {
        const char *hostname = Curl_dyn_ptr(&host);
        /* legacy curl-style guess based on hostname */
        if(checkprefix("ftp.", hostname))
          schemep = "ftp";
        else if(checkprefix("dict.", hostname))
          schemep = "dict";
        else if(checkprefix("ldap.", hostname))
          schemep = "ldap";
        else if(checkprefix("imap.", hostname))
          schemep = "imap";
        else if(checkprefix("smtp.", hostname))
          schemep = "smtp";
        else if(checkprefix("pop3.", hostname))
          schemep = "pop3";
        else
          schemep = "http";

        u->scheme = strdup(schemep);
        if(!u->scheme) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
        /* remember that the scheme was guessed and not given in the URL */
        u->guessed_scheme = TRUE;
      }
    }
    else if(flags & CURLU_NO_AUTHORITY) {
      /* allowed to be empty. */
      if(Curl_dyn_add(&host, "")) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }
    else {
      result = CURLUE_NO_HOST;
      goto fail;
    }
  }

  /* cut off the fragment first: everything from the first '#' */
  fragment = strchr(path, '#');
  if(fragment) {
    /* fraglen counts the leading '#' as well */
    fraglen = pathlen - (fragment - path);
    u->fragment_present = TRUE;
    if(fraglen > 1) {
      /* skip the leading '#' in the copy but include the terminating null */
      if(flags & CURLU_URLENCODE) {
        struct dynbuf enc;
        Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
        result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
        if(result)
          goto fail;
        u->fragment = Curl_dyn_ptr(&enc);
      }
      else {
        u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
        if(!u->fragment) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
      }
    }
    /* after this, pathlen still contains the query */
    pathlen -= fraglen;
  }

  /* then the query: from the first '?' up to the fragment or the end */
  query = memchr(path, '?', pathlen);
  if(query) {
    /* qlen counts the leading '?' as well */
    size_t qlen = fragment ? (size_t)(fragment - query) :
      pathlen - (query - path);
    pathlen -= qlen;
    u->query_present = TRUE;
    if(qlen > 1) {
      if(flags & CURLU_URLENCODE) {
        struct dynbuf enc;
        Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
        /* skip the leading question mark */
        result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
        if(result)
          goto fail;
        u->query = Curl_dyn_ptr(&enc);
      }
      else {
        u->query = Curl_memdup0(query + 1, qlen - 1);
        if(!u->query) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
      }
    }
    else {
      /* single byte query */
      u->query = strdup("");
      if(!u->query) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }
  }

  /* what remains in path/pathlen now is the path only */
  if(pathlen && (flags & CURLU_URLENCODE)) {
    struct dynbuf enc;
    Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
    result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
    if(result)
      goto fail;
    pathlen = Curl_dyn_len(&enc);
    path = u->path = Curl_dyn_ptr(&enc);
  }

  if(pathlen <= 1) {
    /* there is no path left or just the slash, unset */
    path = NULL;
  }
  else {
    if(!u->path) {
      /* not encoded above, so store a copy of the raw path */
      u->path = Curl_memdup0(path, pathlen);
      if(!u->path) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
      path = u->path;
    }
    else if(flags & CURLU_URLENCODE)
      /* it might have encoded more than just the path so cut it */
      u->path[pathlen] = 0;

    if(!(flags & CURLU_PATH_AS_IS)) {
      /* remove ../ and ./ sequences according to RFC3986 */
      char *dedot;
      int err = dedotdotify((char *)path, pathlen, &dedot);
      if(err) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
      /* a non-NULL 'dedot' is a new string that replaces the stored path */
      if(dedot) {
        free(u->path);
        u->path = dedot;
      }
    }
  }

  /* hand over the host buffer to the handle */
  u->host = Curl_dyn_ptr(&host);

  return result;
fail:
  Curl_dyn_free(&host);
  free_urlhandle(u);
  return result;
}
1285 
1286 /*
1287  * Parse the URL and, if successful, replace everything in the Curl_URL struct.
1288  */
parseurl_and_replace(const char * url,CURLU * u,unsigned int flags)1289 static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
1290                                       unsigned int flags)
1291 {
1292   CURLUcode result;
1293   CURLU tmpurl;
1294   memset(&tmpurl, 0, sizeof(tmpurl));
1295   result = parseurl(url, &tmpurl, flags);
1296   if(!result) {
1297     free_urlhandle(u);
1298     *u = tmpurl;
1299   }
1300   return result;
1301 }
1302 
1303 /*
1304  */
curl_url(void)1305 CURLU *curl_url(void)
1306 {
1307   return calloc(1, sizeof(struct Curl_URL));
1308 }
1309 
/*
 * curl_url_cleanup() releases the parts held by the handle and then the
 * handle itself. Passing NULL is allowed and does nothing.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
1317 
/*
 * DUP() copies the string member 'name' from the struct 'src' points to into
 * the struct 'dest' points to, using strdup(). A NULL member in 'src' leaves
 * the member in 'dest' untouched. On allocation failure it jumps to a 'fail'
 * label that the enclosing function must provide.
 *
 * The pointer arguments are parenthesized so that expressions such as
 * '&object' can be passed safely.
 */
#define DUP(dest, src, name)                    \
  do {                                          \
    if((src)->name) {                           \
      (dest)->name = strdup((src)->name);       \
      if(!(dest)->name)                         \
        goto fail;                              \
    }                                           \
  } while(0)
1326 
curl_url_dup(const CURLU * in)1327 CURLU *curl_url_dup(const CURLU *in)
1328 {
1329   struct Curl_URL *u = calloc(1, sizeof(struct Curl_URL));
1330   if(u) {
1331     DUP(u, in, scheme);
1332     DUP(u, in, user);
1333     DUP(u, in, password);
1334     DUP(u, in, options);
1335     DUP(u, in, host);
1336     DUP(u, in, port);
1337     DUP(u, in, path);
1338     DUP(u, in, query);
1339     DUP(u, in, fragment);
1340     DUP(u, in, zoneid);
1341     u->portnum = in->portnum;
1342     u->fragment_present = in->fragment_present;
1343     u->query_present = in->query_present;
1344   }
1345   return u;
1346 fail:
1347   curl_url_cleanup(u);
1348   return NULL;
1349 }
1350 
curl_url_get(const CURLU * u,CURLUPart what,char ** part,unsigned int flags)1351 CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
1352                        char **part, unsigned int flags)
1353 {
1354   const char *ptr;
1355   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1356   char portbuf[7];
1357   bool urldecode = (flags & CURLU_URLDECODE) ? 1 : 0;
1358   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1359   bool punycode = FALSE;
1360   bool depunyfy = FALSE;
1361   bool plusdecode = FALSE;
1362   (void)flags;
1363   if(!u)
1364     return CURLUE_BAD_HANDLE;
1365   if(!part)
1366     return CURLUE_BAD_PARTPOINTER;
1367   *part = NULL;
1368 
1369   switch(what) {
1370   case CURLUPART_SCHEME:
1371     ptr = u->scheme;
1372     ifmissing = CURLUE_NO_SCHEME;
1373     urldecode = FALSE; /* never for schemes */
1374     if((flags & CURLU_NO_GUESS_SCHEME) && u->guessed_scheme)
1375       return CURLUE_NO_SCHEME;
1376     break;
1377   case CURLUPART_USER:
1378     ptr = u->user;
1379     ifmissing = CURLUE_NO_USER;
1380     break;
1381   case CURLUPART_PASSWORD:
1382     ptr = u->password;
1383     ifmissing = CURLUE_NO_PASSWORD;
1384     break;
1385   case CURLUPART_OPTIONS:
1386     ptr = u->options;
1387     ifmissing = CURLUE_NO_OPTIONS;
1388     break;
1389   case CURLUPART_HOST:
1390     ptr = u->host;
1391     ifmissing = CURLUE_NO_HOST;
1392     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1393     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1394     break;
1395   case CURLUPART_ZONEID:
1396     ptr = u->zoneid;
1397     ifmissing = CURLUE_NO_ZONEID;
1398     break;
1399   case CURLUPART_PORT:
1400     ptr = u->port;
1401     ifmissing = CURLUE_NO_PORT;
1402     urldecode = FALSE; /* never for port */
1403     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1404       /* there is no stored port number, but asked to deliver
1405          a default one for the scheme */
1406       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1407       if(h) {
1408         msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1409         ptr = portbuf;
1410       }
1411     }
1412     else if(ptr && u->scheme) {
1413       /* there is a stored port number, but ask to inhibit if
1414          it matches the default one for the scheme */
1415       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1416       if(h && (h->defport == u->portnum) &&
1417          (flags & CURLU_NO_DEFAULT_PORT))
1418         ptr = NULL;
1419     }
1420     break;
1421   case CURLUPART_PATH:
1422     ptr = u->path;
1423     if(!ptr)
1424       ptr = "/";
1425     break;
1426   case CURLUPART_QUERY:
1427     ptr = u->query;
1428     ifmissing = CURLUE_NO_QUERY;
1429     plusdecode = urldecode;
1430     if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY))
1431       /* there was a blank query and the user do not ask for it */
1432       ptr = NULL;
1433     break;
1434   case CURLUPART_FRAGMENT:
1435     ptr = u->fragment;
1436     ifmissing = CURLUE_NO_FRAGMENT;
1437     if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY)
1438       /* there was a blank fragment and the user asks for it */
1439       ptr = "";
1440     break;
1441   case CURLUPART_URL: {
1442     char *url;
1443     char *scheme;
1444     char *options = u->options;
1445     char *port = u->port;
1446     char *allochost = NULL;
1447     bool show_fragment =
1448       u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY);
1449     bool show_query =
1450       (u->query && u->query[0]) ||
1451       (u->query_present && flags & CURLU_GET_EMPTY);
1452     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1453     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1454     if(u->scheme && strcasecompare("file", u->scheme)) {
1455       url = aprintf("file://%s%s%s",
1456                     u->path,
1457                     show_fragment ? "#": "",
1458                     u->fragment ? u->fragment : "");
1459     }
1460     else if(!u->host)
1461       return CURLUE_NO_HOST;
1462     else {
1463       const struct Curl_handler *h = NULL;
1464       char schemebuf[MAX_SCHEME_LEN + 5];
1465       if(u->scheme)
1466         scheme = u->scheme;
1467       else if(flags & CURLU_DEFAULT_SCHEME)
1468         scheme = (char *) DEFAULT_SCHEME;
1469       else
1470         return CURLUE_NO_SCHEME;
1471 
1472       h = Curl_get_scheme_handler(scheme);
1473       if(!port && (flags & CURLU_DEFAULT_PORT)) {
1474         /* there is no stored port number, but asked to deliver
1475            a default one for the scheme */
1476         if(h) {
1477           msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1478           port = portbuf;
1479         }
1480       }
1481       else if(port) {
1482         /* there is a stored port number, but asked to inhibit if it matches
1483            the default one for the scheme */
1484         if(h && (h->defport == u->portnum) &&
1485            (flags & CURLU_NO_DEFAULT_PORT))
1486           port = NULL;
1487       }
1488 
1489       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1490         options = NULL;
1491 
1492       if(u->host[0] == '[') {
1493         if(u->zoneid) {
1494           /* make it '[ host %25 zoneid ]' */
1495           struct dynbuf enc;
1496           size_t hostlen = strlen(u->host);
1497           Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1498           if(Curl_dyn_addf(&enc, "%.*s%%25%s]", (int)hostlen - 1, u->host,
1499                            u->zoneid))
1500             return CURLUE_OUT_OF_MEMORY;
1501           allochost = Curl_dyn_ptr(&enc);
1502         }
1503       }
1504       else if(urlencode) {
1505         allochost = curl_easy_escape(NULL, u->host, 0);
1506         if(!allochost)
1507           return CURLUE_OUT_OF_MEMORY;
1508       }
1509       else if(punycode) {
1510         if(!Curl_is_ASCII_name(u->host)) {
1511 #ifndef USE_IDN
1512           return CURLUE_LACKS_IDN;
1513 #else
1514           CURLcode result = Curl_idn_decode(u->host, &allochost);
1515           if(result)
1516             return (result == CURLE_OUT_OF_MEMORY) ?
1517               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1518 #endif
1519         }
1520       }
1521       else if(depunyfy) {
1522         if(Curl_is_ASCII_name(u->host) && !strncmp("xn--", u->host, 4)) {
1523 #ifndef USE_IDN
1524           return CURLUE_LACKS_IDN;
1525 #else
1526           CURLcode result = Curl_idn_encode(u->host, &allochost);
1527           if(result)
1528             /* this is the most likely error */
1529             return (result == CURLE_OUT_OF_MEMORY) ?
1530               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1531 #endif
1532         }
1533       }
1534 
1535       if(!(flags & CURLU_NO_GUESS_SCHEME) || !u->guessed_scheme)
1536         msnprintf(schemebuf, sizeof(schemebuf), "%s://", scheme);
1537       else
1538         schemebuf[0] = 0;
1539 
1540       url = aprintf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1541                     schemebuf,
1542                     u->user ? u->user : "",
1543                     u->password ? ":": "",
1544                     u->password ? u->password : "",
1545                     options ? ";" : "",
1546                     options ? options : "",
1547                     (u->user || u->password || options) ? "@": "",
1548                     allochost ? allochost : u->host,
1549                     port ? ":": "",
1550                     port ? port : "",
1551                     u->path ? u->path : "/",
1552                     show_query ? "?": "",
1553                     u->query ? u->query : "",
1554                     show_fragment ? "#": "",
1555                     u->fragment ? u->fragment : "");
1556       free(allochost);
1557     }
1558     if(!url)
1559       return CURLUE_OUT_OF_MEMORY;
1560     *part = url;
1561     return CURLUE_OK;
1562   }
1563   default:
1564     ptr = NULL;
1565     break;
1566   }
1567   if(ptr) {
1568     size_t partlen = strlen(ptr);
1569     size_t i = 0;
1570     *part = Curl_memdup0(ptr, partlen);
1571     if(!*part)
1572       return CURLUE_OUT_OF_MEMORY;
1573     if(plusdecode) {
1574       /* convert + to space */
1575       char *plus = *part;
1576       for(i = 0; i < partlen; ++plus, i++) {
1577         if(*plus == '+')
1578           *plus = ' ';
1579       }
1580     }
1581     if(urldecode) {
1582       char *decoded;
1583       size_t dlen;
1584       /* this unconditional rejection of control bytes is documented
1585          API behavior */
1586       CURLcode res = Curl_urldecode(*part, 0, &decoded, &dlen, REJECT_CTRL);
1587       free(*part);
1588       if(res) {
1589         *part = NULL;
1590         return CURLUE_URLDECODE;
1591       }
1592       *part = decoded;
1593       partlen = dlen;
1594     }
1595     if(urlencode) {
1596       struct dynbuf enc;
1597       CURLUcode uc;
1598       Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1599       uc = urlencode_str(&enc, *part, partlen, TRUE, what == CURLUPART_QUERY);
1600       if(uc)
1601         return uc;
1602       free(*part);
1603       *part = Curl_dyn_ptr(&enc);
1604     }
1605     else if(punycode) {
1606       if(!Curl_is_ASCII_name(u->host)) {
1607 #ifndef USE_IDN
1608         return CURLUE_LACKS_IDN;
1609 #else
1610         char *allochost;
1611         CURLcode result = Curl_idn_decode(*part, &allochost);
1612         if(result)
1613           return (result == CURLE_OUT_OF_MEMORY) ?
1614             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1615         free(*part);
1616         *part = allochost;
1617 #endif
1618       }
1619     }
1620     else if(depunyfy) {
1621       if(Curl_is_ASCII_name(u->host)  && !strncmp("xn--", u->host, 4)) {
1622 #ifndef USE_IDN
1623         return CURLUE_LACKS_IDN;
1624 #else
1625         char *allochost;
1626         CURLcode result = Curl_idn_encode(*part, &allochost);
1627         if(result)
1628           return (result == CURLE_OUT_OF_MEMORY) ?
1629             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1630         free(*part);
1631         *part = allochost;
1632 #endif
1633       }
1634     }
1635 
1636     return CURLUE_OK;
1637   }
1638   else
1639     return ifmissing;
1640 }
1641 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1642 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1643                        const char *part, unsigned int flags)
1644 {
1645   char **storep = NULL;
1646   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1647   bool plusencode = FALSE;
1648   bool urlskipslash = FALSE;
1649   bool leadingslash = FALSE;
1650   bool appendquery = FALSE;
1651   bool equalsencode = FALSE;
1652   size_t nalloc;
1653 
1654   if(!u)
1655     return CURLUE_BAD_HANDLE;
1656   if(!part) {
1657     /* setting a part to NULL clears it */
1658     switch(what) {
1659     case CURLUPART_URL:
1660       break;
1661     case CURLUPART_SCHEME:
1662       storep = &u->scheme;
1663       u->guessed_scheme = FALSE;
1664       break;
1665     case CURLUPART_USER:
1666       storep = &u->user;
1667       break;
1668     case CURLUPART_PASSWORD:
1669       storep = &u->password;
1670       break;
1671     case CURLUPART_OPTIONS:
1672       storep = &u->options;
1673       break;
1674     case CURLUPART_HOST:
1675       storep = &u->host;
1676       break;
1677     case CURLUPART_ZONEID:
1678       storep = &u->zoneid;
1679       break;
1680     case CURLUPART_PORT:
1681       u->portnum = 0;
1682       storep = &u->port;
1683       break;
1684     case CURLUPART_PATH:
1685       storep = &u->path;
1686       break;
1687     case CURLUPART_QUERY:
1688       storep = &u->query;
1689       u->query_present = FALSE;
1690       break;
1691     case CURLUPART_FRAGMENT:
1692       storep = &u->fragment;
1693       u->fragment_present = FALSE;
1694       break;
1695     default:
1696       return CURLUE_UNKNOWN_PART;
1697     }
1698     if(storep && *storep) {
1699       Curl_safefree(*storep);
1700     }
1701     else if(!storep) {
1702       free_urlhandle(u);
1703       memset(u, 0, sizeof(struct Curl_URL));
1704     }
1705     return CURLUE_OK;
1706   }
1707 
1708   nalloc = strlen(part);
1709   if(nalloc > CURL_MAX_INPUT_LENGTH)
1710     /* excessive input length */
1711     return CURLUE_MALFORMED_INPUT;
1712 
1713   switch(what) {
1714   case CURLUPART_SCHEME: {
1715     size_t plen = strlen(part);
1716     const char *s = part;
1717     if((plen > MAX_SCHEME_LEN) || (plen < 1))
1718       /* too long or too short */
1719       return CURLUE_BAD_SCHEME;
1720    /* verify that it is a fine scheme */
1721     if(!(flags & CURLU_NON_SUPPORT_SCHEME) && !Curl_get_scheme_handler(part))
1722       return CURLUE_UNSUPPORTED_SCHEME;
1723     storep = &u->scheme;
1724     urlencode = FALSE; /* never */
1725     if(ISALPHA(*s)) {
1726       /* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
1727       while(--plen) {
1728         if(ISALNUM(*s) || (*s == '+') || (*s == '-') || (*s == '.'))
1729           s++; /* fine */
1730         else
1731           return CURLUE_BAD_SCHEME;
1732       }
1733     }
1734     else
1735       return CURLUE_BAD_SCHEME;
1736     u->guessed_scheme = FALSE;
1737     break;
1738   }
1739   case CURLUPART_USER:
1740     storep = &u->user;
1741     break;
1742   case CURLUPART_PASSWORD:
1743     storep = &u->password;
1744     break;
1745   case CURLUPART_OPTIONS:
1746     storep = &u->options;
1747     break;
1748   case CURLUPART_HOST:
1749     storep = &u->host;
1750     Curl_safefree(u->zoneid);
1751     break;
1752   case CURLUPART_ZONEID:
1753     storep = &u->zoneid;
1754     break;
1755   case CURLUPART_PORT:
1756     if(!ISDIGIT(part[0]))
1757       /* not a number */
1758       return CURLUE_BAD_PORT_NUMBER;
1759     else {
1760       char *tmp;
1761       char *endp;
1762       unsigned long port;
1763       errno = 0;
1764       port = strtoul(part, &endp, 10);  /* must be decimal */
1765       if(errno || (port > 0xffff) || *endp)
1766         /* weirdly provided number, not good! */
1767         return CURLUE_BAD_PORT_NUMBER;
1768       tmp = strdup(part);
1769       if(!tmp)
1770         return CURLUE_OUT_OF_MEMORY;
1771       free(u->port);
1772       u->port = tmp;
1773       u->portnum = (unsigned short)port;
1774       return CURLUE_OK;
1775     }
1776   case CURLUPART_PATH:
1777     urlskipslash = TRUE;
1778     leadingslash = TRUE; /* enforce */
1779     storep = &u->path;
1780     break;
1781   case CURLUPART_QUERY:
1782     plusencode = urlencode;
1783     appendquery = (flags & CURLU_APPENDQUERY) ? 1 : 0;
1784     equalsencode = appendquery;
1785     storep = &u->query;
1786     u->query_present = TRUE;
1787     break;
1788   case CURLUPART_FRAGMENT:
1789     storep = &u->fragment;
1790     u->fragment_present = TRUE;
1791     break;
1792   case CURLUPART_URL: {
1793     /*
1794      * Allow a new URL to replace the existing (if any) contents.
1795      *
1796      * If the existing contents is enough for a URL, allow a relative URL to
1797      * replace it.
1798      */
1799     CURLUcode uc;
1800     char *oldurl;
1801 
1802     if(!nalloc)
1803       /* a blank URL is not a valid URL */
1804       return CURLUE_MALFORMED_INPUT;
1805 
1806     /* if the new thing is absolute or the old one is not (we could not get an
1807      * absolute URL in 'oldurl'), then replace the existing with the new. */
1808     if(Curl_is_absolute_url(part, NULL, 0,
1809                             flags & (CURLU_GUESS_SCHEME|CURLU_DEFAULT_SCHEME))
1810        || curl_url_get(u, CURLUPART_URL, &oldurl, flags)) {
1811       return parseurl_and_replace(part, u, flags);
1812     }
1813 
1814     /* apply the relative part to create a new URL */
1815     uc = redirect_url(oldurl, part, u, flags);
1816     free(oldurl);
1817     return uc;
1818   }
1819   default:
1820     return CURLUE_UNKNOWN_PART;
1821   }
1822   DEBUGASSERT(storep);
1823   {
1824     const char *newp;
1825     struct dynbuf enc;
1826     Curl_dyn_init(&enc, nalloc * 3 + 1 + leadingslash);
1827 
1828     if(leadingslash && (part[0] != '/')) {
1829       CURLcode result = Curl_dyn_addn(&enc, "/", 1);
1830       if(result)
1831         return cc2cu(result);
1832     }
1833     if(urlencode) {
1834       const unsigned char *i;
1835 
1836       for(i = (const unsigned char *)part; *i; i++) {
1837         CURLcode result;
1838         if((*i == ' ') && plusencode) {
1839           result = Curl_dyn_addn(&enc, "+", 1);
1840           if(result)
1841             return CURLUE_OUT_OF_MEMORY;
1842         }
1843         else if(ISUNRESERVED(*i) ||
1844                 ((*i == '/') && urlskipslash) ||
1845                 ((*i == '=') && equalsencode)) {
1846           if((*i == '=') && equalsencode)
1847             /* only skip the first equals sign */
1848             equalsencode = FALSE;
1849           result = Curl_dyn_addn(&enc, i, 1);
1850           if(result)
1851             return cc2cu(result);
1852         }
1853         else {
1854           char out[3]={'%'};
1855           out[1] = hexdigits[*i >> 4];
1856           out[2] = hexdigits[*i & 0xf];
1857           result = Curl_dyn_addn(&enc, out, 3);
1858           if(result)
1859             return cc2cu(result);
1860         }
1861       }
1862     }
1863     else {
1864       char *p;
1865       CURLcode result = Curl_dyn_add(&enc, part);
1866       if(result)
1867         return cc2cu(result);
1868       p = Curl_dyn_ptr(&enc);
1869       while(*p) {
1870         /* make sure percent encoded are lower case */
1871         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1872            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1873           p[1] = Curl_raw_tolower(p[1]);
1874           p[2] = Curl_raw_tolower(p[2]);
1875           p += 3;
1876         }
1877         else
1878           p++;
1879       }
1880     }
1881     newp = Curl_dyn_ptr(&enc);
1882 
1883     if(appendquery && newp) {
1884       /* Append the 'newp' string onto the old query. Add a '&' separator if
1885          none is present at the end of the existing query already */
1886 
1887       size_t querylen = u->query ? strlen(u->query) : 0;
1888       bool addamperand = querylen && (u->query[querylen -1] != '&');
1889       if(querylen) {
1890         struct dynbuf qbuf;
1891         Curl_dyn_init(&qbuf, CURL_MAX_INPUT_LENGTH);
1892 
1893         if(Curl_dyn_addn(&qbuf, u->query, querylen)) /* add original query */
1894           goto nomem;
1895 
1896         if(addamperand) {
1897           if(Curl_dyn_addn(&qbuf, "&", 1))
1898             goto nomem;
1899         }
1900         if(Curl_dyn_add(&qbuf, newp))
1901           goto nomem;
1902         Curl_dyn_free(&enc);
1903         free(*storep);
1904         *storep = Curl_dyn_ptr(&qbuf);
1905         return CURLUE_OK;
1906 nomem:
1907         Curl_dyn_free(&enc);
1908         return CURLUE_OUT_OF_MEMORY;
1909       }
1910     }
1911 
1912     else if(what == CURLUPART_HOST) {
1913       size_t n = Curl_dyn_len(&enc);
1914       if(!n && (flags & CURLU_NO_AUTHORITY)) {
1915         /* Skip hostname check, it is allowed to be empty. */
1916       }
1917       else {
1918         bool bad = FALSE;
1919         if(!n)
1920           bad = TRUE; /* empty hostname is not okay */
1921         else if(!urlencode) {
1922           /* if the host name part was not URL encoded here, it was set ready
1923              URL encoded so we need to decode it to check */
1924           size_t dlen;
1925           char *decoded = NULL;
1926           CURLcode result =
1927             Curl_urldecode(newp, n, &decoded, &dlen, REJECT_CTRL);
1928           if(result || hostname_check(u, decoded, dlen))
1929             bad = TRUE;
1930           free(decoded);
1931         }
1932         else if(hostname_check(u, (char *)newp, n))
1933           bad = TRUE;
1934         if(bad) {
1935           Curl_dyn_free(&enc);
1936           return CURLUE_BAD_HOSTNAME;
1937         }
1938       }
1939     }
1940 
1941     free(*storep);
1942     *storep = (char *)newp;
1943   }
1944   return CURLUE_OK;
1945 }
1946