1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2020, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22
23 #include "curl_setup.h"
24
25 #include "urldata.h"
26 #include "urlapi-int.h"
27 #include "strcase.h"
28 #include "dotdot.h"
29 #include "url.h"
30 #include "escape.h"
31 #include "curl_ctype.h"
32 #include "inet_pton.h"
33
34 /* The last 3 #include files should be in this order */
35 #include "curl_printf.h"
36 #include "curl_memory.h"
37 #include "memdebug.h"
38
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * 'str' is parenthesized at every use so that any pointer expression (such
 * as 'p + 1') can be passed. Note that the argument is evaluated more than
 * once, so it must not have side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
44
/* Drive prefix as it may appear inside a URL: one ASCII letter, then either
 * ':' or '|', and after that a '/', a '\' or the end of the string.
 * The argument is evaluated several times; pass no side effects. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  ((((str)[0] >= 'a' && (str)[0] <= 'z') || \
    ((str)[0] >= 'A' && (str)[0] <= 'Z')) && \
   ((str)[1] == ':' || (str)[1] == '|') && \
   ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == 0))
52
/* The data behind a CURLU handle. Each URL part is kept as a separately
   allocated string (or NULL when the part is absent); all of them are
   released by free_urlhandle(). */
struct Curl_URL {
  char *scheme;
  char *user;
  char *password;
  char *options;  /* IMAP only? */
  char *host;
  char *zoneid;   /* zone id taken from a bracketed numerical IPv6 host */
  char *port;     /* port number as a string */
  char *path;
  char *query;
  char *fragment;

  char *scratch;  /* work buffer used while a URL is being parsed */
  char *temppath; /* path allocated during parsing, not yet stored in
                     'path' */
  long portnum;   /* 'port' in numerical form */
};
70
/* scheme used for scheme-less URLs when CURLU_DEFAULT_SCHEME is set */
#define DEFAULT_SCHEME "https"
72
free_urlhandle(struct Curl_URL * u)73 static void free_urlhandle(struct Curl_URL *u)
74 {
75 free(u->scheme);
76 free(u->user);
77 free(u->password);
78 free(u->options);
79 free(u->host);
80 free(u->zoneid);
81 free(u->port);
82 free(u->path);
83 free(u->query);
84 free(u->fragment);
85 free(u->scratch);
86 free(u->temppath);
87 }
88
89 /* move the full contents of one handle onto another and
90 free the original */
mv_urlhandle(struct Curl_URL * from,struct Curl_URL * to)91 static void mv_urlhandle(struct Curl_URL *from,
92 struct Curl_URL *to)
93 {
94 free_urlhandle(to);
95 *to = *from;
96 free(from);
97 }
98
/*
 * Return a pointer to where the host name part of 'url' ends: the first '/'
 * or '?' found after the leading "//" (or from the start of the string when
 * there is no "//"), whichever comes first. This handles URLs such as
 * http://www.url.com?id=2380 that lack the slash. When neither character is
 * present, the returned pointer is the terminating zero byte.
 */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *host = strstr(url, "//");
  const char *slash;
  const char *qmark;

  /* step over the double slash if there is one */
  host = host ? host + 2 : url;

  slash = strchr(host, '/');
  qmark = strchr(host, '?');

  /* a missing separator counts as being at the end of the string */
  if(!slash)
    slash = end;
  if(!qmark)
    qmark = end;

  return (qmark <= slash) ? qmark : slash;
}
126
127 /*
128 * Decide in an encoding-independent manner whether a character in an
129 * URL must be escaped. The same criterion must be used in strlen_url()
130 * and strcpy_url().
131 */
urlchar_needs_escaping(int c)132 static bool urlchar_needs_escaping(int c)
133 {
134 return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
135 }
136
137 /*
138 * strlen_url() returns the length of the given URL if the spaces within the
139 * URL were properly URL encoded.
140 * URL encoding should be skipped for host names, otherwise IDN resolution
141 * will fail.
142 */
strlen_url(const char * url,bool relative)143 static size_t strlen_url(const char *url, bool relative)
144 {
145 const unsigned char *ptr;
146 size_t newlen = 0;
147 bool left = TRUE; /* left side of the ? */
148 const unsigned char *host_sep = (const unsigned char *) url;
149
150 if(!relative)
151 host_sep = (const unsigned char *) find_host_sep(url);
152
153 for(ptr = (unsigned char *)url; *ptr; ptr++) {
154
155 if(ptr < host_sep) {
156 ++newlen;
157 continue;
158 }
159
160 switch(*ptr) {
161 case '?':
162 left = FALSE;
163 /* FALLTHROUGH */
164 default:
165 if(urlchar_needs_escaping(*ptr))
166 newlen += 2;
167 newlen++;
168 break;
169 case ' ':
170 if(left)
171 newlen += 3;
172 else
173 newlen++;
174 break;
175 }
176 }
177 return newlen;
178 }
179
180 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
181 * the source URL accordingly.
182 * URL encoding should be skipped for host names, otherwise IDN resolution
183 * will fail.
184 */
strcpy_url(char * output,const char * url,bool relative)185 static void strcpy_url(char *output, const char *url, bool relative)
186 {
187 /* we must add this with whitespace-replacing */
188 bool left = TRUE;
189 const unsigned char *iptr;
190 char *optr = output;
191 const unsigned char *host_sep = (const unsigned char *) url;
192
193 if(!relative)
194 host_sep = (const unsigned char *) find_host_sep(url);
195
196 for(iptr = (unsigned char *)url; /* read from here */
197 *iptr; /* until zero byte */
198 iptr++) {
199
200 if(iptr < host_sep) {
201 *optr++ = *iptr;
202 continue;
203 }
204
205 switch(*iptr) {
206 case '?':
207 left = FALSE;
208 /* FALLTHROUGH */
209 default:
210 if(urlchar_needs_escaping(*iptr)) {
211 msnprintf(optr, 4, "%%%02x", *iptr);
212 optr += 3;
213 }
214 else
215 *optr++=*iptr;
216 break;
217 case ' ':
218 if(left) {
219 *optr++='%'; /* add a '%' */
220 *optr++='2'; /* add a '2' */
221 *optr++='0'; /* add a '0' */
222 }
223 else
224 *optr++='+'; /* add a '+' here */
225 break;
226 }
227 }
228 *optr = 0; /* null-terminate output buffer */
229
230 }
231
232 /*
233 * Returns true if the given URL is absolute (as opposed to relative) within
234 * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
235 * non-NULL.
236 */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen)237 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
238 {
239 size_t i;
240 #ifdef WIN32
241 if(STARTS_WITH_DRIVE_PREFIX(url))
242 return FALSE;
243 #endif
244 for(i = 0; i < buflen && url[i]; ++i) {
245 char s = url[i];
246 if((s == ':') && (url[i + 1] == '/')) {
247 if(buf)
248 buf[i] = 0;
249 return TRUE;
250 }
251 /* RFC 3986 3.1 explains:
252 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
253 */
254 else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
255 if(buf)
256 buf[i] = (char)TOLOWER(s);
257 }
258 else
259 break;
260 }
261 return FALSE;
262 }
263
264 /*
265 * Concatenate a relative URL to a base URL making it absolute.
266 * URL-encodes any spaces.
267 * The returned pointer must be freed by the caller unless NULL
268 * (returns NULL on out of memory).
269 */
concat_url(const char * base,const char * relurl)270 static char *concat_url(const char *base, const char *relurl)
271 {
272 /***
273 TRY to append this new path to the old URL
274 to the right of the host part. Oh crap, this is doomed to cause
275 problems in the future...
276 */
277 char *newest;
278 char *protsep;
279 char *pathsep;
280 size_t newlen;
281 bool host_changed = FALSE;
282
283 const char *useurl = relurl;
284 size_t urllen;
285
286 /* we must make our own copy of the URL to play with, as it may
287 point to read-only data */
288 char *url_clone = strdup(base);
289
290 if(!url_clone)
291 return NULL; /* skip out of this NOW */
292
293 /* protsep points to the start of the host name */
294 protsep = strstr(url_clone, "//");
295 if(!protsep)
296 protsep = url_clone;
297 else
298 protsep += 2; /* pass the slashes */
299
300 if('/' != relurl[0]) {
301 int level = 0;
302
303 /* First we need to find out if there's a ?-letter in the URL,
304 and cut it and the right-side of that off */
305 pathsep = strchr(protsep, '?');
306 if(pathsep)
307 *pathsep = 0;
308
309 /* we have a relative path to append to the last slash if there's one
310 available, or if the new URL is just a query string (starts with a
311 '?') we append the new one at the end of the entire currently worked
312 out URL */
313 if(useurl[0] != '?') {
314 pathsep = strrchr(protsep, '/');
315 if(pathsep)
316 *pathsep = 0;
317 }
318
319 /* Check if there's any slash after the host name, and if so, remember
320 that position instead */
321 pathsep = strchr(protsep, '/');
322 if(pathsep)
323 protsep = pathsep + 1;
324 else
325 protsep = NULL;
326
327 /* now deal with one "./" or any amount of "../" in the newurl
328 and act accordingly */
329
330 if((useurl[0] == '.') && (useurl[1] == '/'))
331 useurl += 2; /* just skip the "./" */
332
333 while((useurl[0] == '.') &&
334 (useurl[1] == '.') &&
335 (useurl[2] == '/')) {
336 level++;
337 useurl += 3; /* pass the "../" */
338 }
339
340 if(protsep) {
341 while(level--) {
342 /* cut off one more level from the right of the original URL */
343 pathsep = strrchr(protsep, '/');
344 if(pathsep)
345 *pathsep = 0;
346 else {
347 *protsep = 0;
348 break;
349 }
350 }
351 }
352 }
353 else {
354 /* We got a new absolute path for this server */
355
356 if(relurl[1] == '/') {
357 /* the new URL starts with //, just keep the protocol part from the
358 original one */
359 *protsep = 0;
360 useurl = &relurl[2]; /* we keep the slashes from the original, so we
361 skip the new ones */
362 host_changed = TRUE;
363 }
364 else {
365 /* cut off the original URL from the first slash, or deal with URLs
366 without slash */
367 pathsep = strchr(protsep, '/');
368 if(pathsep) {
369 /* When people use badly formatted URLs, such as
370 "http://www.url.com?dir=/home/daniel" we must not use the first
371 slash, if there's a ?-letter before it! */
372 char *sep = strchr(protsep, '?');
373 if(sep && (sep < pathsep))
374 pathsep = sep;
375 *pathsep = 0;
376 }
377 else {
378 /* There was no slash. Now, since we might be operating on a badly
379 formatted URL, such as "http://www.url.com?id=2380" which doesn't
380 use a slash separator as it is supposed to, we need to check for a
381 ?-letter as well! */
382 pathsep = strchr(protsep, '?');
383 if(pathsep)
384 *pathsep = 0;
385 }
386 }
387 }
388
389 /* If the new part contains a space, this is a mighty stupid redirect
390 but we still make an effort to do "right". To the left of a '?'
391 letter we replace each space with %20 while it is replaced with '+'
392 on the right side of the '?' letter.
393 */
394 newlen = strlen_url(useurl, !host_changed);
395
396 urllen = strlen(url_clone);
397
398 newest = malloc(urllen + 1 + /* possible slash */
399 newlen + 1 /* zero byte */);
400
401 if(!newest) {
402 free(url_clone); /* don't leak this */
403 return NULL;
404 }
405
406 /* copy over the root url part */
407 memcpy(newest, url_clone, urllen);
408
409 /* check if we need to append a slash */
410 if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
411 ;
412 else
413 newest[urllen++]='/';
414
415 /* then append the new piece on the right side */
416 strcpy_url(&newest[urllen], useurl, !host_changed);
417
418 free(url_clone);
419
420 return newest;
421 }
422
423 /*
424 * parse_hostname_login()
425 *
426 * Parse the login details (user name, password and options) from the URL and
427 * strip them out of the host name
428 *
429 */
parse_hostname_login(struct Curl_URL * u,char ** hostname,unsigned int flags)430 static CURLUcode parse_hostname_login(struct Curl_URL *u,
431 char **hostname,
432 unsigned int flags)
433 {
434 CURLUcode result = CURLUE_OK;
435 CURLcode ccode;
436 char *userp = NULL;
437 char *passwdp = NULL;
438 char *optionsp = NULL;
439 const struct Curl_handler *h = NULL;
440
441 /* At this point, we're hoping all the other special cases have
442 * been taken care of, so conn->host.name is at most
443 * [user[:password][;options]]@]hostname
444 *
445 * We need somewhere to put the embedded details, so do that first.
446 */
447
448 char *ptr = strchr(*hostname, '@');
449 char *login = *hostname;
450
451 if(!ptr)
452 goto out;
453
454 /* We will now try to extract the
455 * possible login information in a string like:
456 * ftp://user:password@ftp.my.site:8021/README */
457 *hostname = ++ptr;
458
459 /* if this is a known scheme, get some details */
460 if(u->scheme)
461 h = Curl_builtin_scheme(u->scheme);
462
463 /* We could use the login information in the URL so extract it. Only parse
464 options if the handler says we should. Note that 'h' might be NULL! */
465 ccode = Curl_parse_login_details(login, ptr - login - 1,
466 &userp, &passwdp,
467 (h && (h->flags & PROTOPT_URLOPTIONS)) ?
468 &optionsp:NULL);
469 if(ccode) {
470 result = CURLUE_MALFORMED_INPUT;
471 goto out;
472 }
473
474 if(userp) {
475 if(flags & CURLU_DISALLOW_USER) {
476 /* Option DISALLOW_USER is set and url contains username. */
477 result = CURLUE_USER_NOT_ALLOWED;
478 goto out;
479 }
480
481 u->user = userp;
482 }
483
484 if(passwdp)
485 u->password = passwdp;
486
487 if(optionsp)
488 u->options = optionsp;
489
490 return CURLUE_OK;
491 out:
492
493 free(userp);
494 free(passwdp);
495 free(optionsp);
496
497 return result;
498 }
499
Curl_parse_port(struct Curl_URL * u,char * hostname)500 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
501 {
502 char *portptr = NULL;
503 char endbracket;
504 int len;
505
506 /*
507 * Find the end of an IPv6 address, either on the ']' ending bracket or
508 * a percent-encoded zone index.
509 */
510 if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
511 &endbracket, &len)) {
512 if(']' == endbracket)
513 portptr = &hostname[len];
514 else if('%' == endbracket) {
515 int zonelen = len;
516 if(1 == sscanf(hostname + zonelen, "%*[^]]%c%n", &endbracket, &len)) {
517 if(']' != endbracket)
518 return CURLUE_MALFORMED_INPUT;
519 portptr = &hostname[--zonelen + len + 1];
520 }
521 else
522 return CURLUE_MALFORMED_INPUT;
523 }
524 else
525 return CURLUE_MALFORMED_INPUT;
526
527 /* this is a RFC2732-style specified IP-address */
528 if(portptr && *portptr) {
529 if(*portptr != ':')
530 return CURLUE_MALFORMED_INPUT;
531 }
532 else
533 portptr = NULL;
534 }
535 else
536 portptr = strchr(hostname, ':');
537
538 if(portptr) {
539 char *rest;
540 long port;
541 char portbuf[7];
542
543 /* Browser behavior adaptation. If there's a colon with no digits after,
544 just cut off the name there which makes us ignore the colon and just
545 use the default port. Firefox, Chrome and Safari all do that. */
546 if(!portptr[1]) {
547 *portptr = '\0';
548 return CURLUE_OK;
549 }
550
551 if(!ISDIGIT(portptr[1]))
552 return CURLUE_BAD_PORT_NUMBER;
553
554 port = strtol(portptr + 1, &rest, 10); /* Port number must be decimal */
555
556 if((port <= 0) || (port > 0xffff))
557 /* Single unix standard says port numbers are 16 bits long, but we don't
558 treat port zero as OK. */
559 return CURLUE_BAD_PORT_NUMBER;
560
561 if(rest[0])
562 return CURLUE_BAD_PORT_NUMBER;
563
564 *portptr++ = '\0'; /* cut off the name there */
565 *rest = 0;
566 /* generate a new port number string to get rid of leading zeroes etc */
567 msnprintf(portbuf, sizeof(portbuf), "%ld", port);
568 u->portnum = port;
569 u->port = strdup(portbuf);
570 if(!u->port)
571 return CURLUE_OUT_OF_MEMORY;
572 }
573
574 return CURLUE_OK;
575 }
576
577 /* scan for byte values < 31 or 127 */
junkscan(const char * part)578 static CURLUcode junkscan(const char *part)
579 {
580 if(part) {
581 static const char badbytes[]={
582 /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
583 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
584 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
585 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
586 0x7f,
587 0x00 /* null-terminate */
588 };
589 size_t n = strlen(part);
590 size_t nfine = strcspn(part, badbytes);
591 if(nfine != n)
592 /* since we don't know which part is scanned, return a generic error
593 code */
594 return CURLUE_MALFORMED_INPUT;
595 }
596 return CURLUE_OK;
597 }
598
hostname_check(struct Curl_URL * u,char * hostname)599 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
600 {
601 size_t len;
602 size_t hlen = strlen(hostname);
603
604 if(hostname[0] == '[') {
605 #ifdef ENABLE_IPV6
606 char dest[16]; /* fits a binary IPv6 address */
607 #endif
608 const char *l = "0123456789abcdefABCDEF:.";
609 if(hlen < 4) /* '[::]' is the shortest possible valid string */
610 return CURLUE_MALFORMED_INPUT;
611 hostname++;
612 hlen -= 2;
613
614 if(hostname[hlen] != ']')
615 return CURLUE_MALFORMED_INPUT;
616
617 /* only valid letters are ok */
618 len = strspn(hostname, l);
619 if(hlen != len) {
620 hlen = len;
621 if(hostname[len] == '%') {
622 /* this could now be '%[zone id]' */
623 char zoneid[16];
624 int i = 0;
625 char *h = &hostname[len + 1];
626 /* pass '25' if present and is a url encoded percent sign */
627 if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
628 h += 2;
629 while(*h && (*h != ']') && (i < 15))
630 zoneid[i++] = *h++;
631 if(!i || (']' != *h))
632 return CURLUE_MALFORMED_INPUT;
633 zoneid[i] = 0;
634 u->zoneid = strdup(zoneid);
635 if(!u->zoneid)
636 return CURLUE_OUT_OF_MEMORY;
637 hostname[len] = ']'; /* insert end bracket */
638 hostname[len + 1] = 0; /* terminate the hostname */
639 }
640 else
641 return CURLUE_MALFORMED_INPUT;
642 /* hostname is fine */
643 }
644 #ifdef ENABLE_IPV6
645 hostname[hlen] = 0; /* end the address there */
646 if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
647 return CURLUE_MALFORMED_INPUT;
648 hostname[hlen] = ']'; /* restore ending bracket */
649 #endif
650 }
651 else {
652 /* letters from the second string is not ok */
653 len = strcspn(hostname, " ");
654 if(hlen != len)
655 /* hostname with bad content */
656 return CURLUE_MALFORMED_INPUT;
657 }
658 if(!hostname[0])
659 return CURLUE_NO_HOST;
660 return CURLUE_OK;
661 }
662
/* TRUE for the characters that end the host name part of a URL. The
   argument is evaluated up to three times. */
#define HOSTNAME_END(x) \
  (((x) == '/') || \
   ((x) == '?') || \
   ((x) == '#'))
664
seturl(const char * url,CURLU * u,unsigned int flags)665 static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
666 {
667 char *path;
668 bool path_alloced = FALSE;
669 char *hostname;
670 char *query = NULL;
671 char *fragment = NULL;
672 CURLUcode result;
673 bool url_has_scheme = FALSE;
674 char schemebuf[MAX_SCHEME_LEN + 1];
675 const char *schemep = NULL;
676 size_t schemelen = 0;
677 size_t urllen;
678
679 if(!url)
680 return CURLUE_MALFORMED_INPUT;
681
682 /*************************************************************
683 * Parse the URL.
684 ************************************************************/
685 /* allocate scratch area */
686 urllen = strlen(url);
687 if(urllen > CURL_MAX_INPUT_LENGTH)
688 /* excessive input length */
689 return CURLUE_MALFORMED_INPUT;
690
691 path = u->scratch = malloc(urllen * 2 + 2);
692 if(!path)
693 return CURLUE_OUT_OF_MEMORY;
694
695 hostname = &path[urllen + 1];
696 hostname[0] = 0;
697
698 if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
699 url_has_scheme = TRUE;
700 schemelen = strlen(schemebuf);
701 }
702
703 /* handle the file: scheme */
704 if(url_has_scheme && strcasecompare(schemebuf, "file")) {
705 /* path has been allocated large enough to hold this */
706 strcpy(path, &url[5]);
707
708 hostname = NULL; /* no host for file: URLs */
709 u->scheme = strdup("file");
710 if(!u->scheme)
711 return CURLUE_OUT_OF_MEMORY;
712
713 /* Extra handling URLs with an authority component (i.e. that start with
714 * "file://")
715 *
716 * We allow omitted hostname (e.g. file:/<path>) -- valid according to
717 * RFC 8089, but not the (current) WHAT-WG URL spec.
718 */
719 if(path[0] == '/' && path[1] == '/') {
720 /* swallow the two slashes */
721 char *ptr = &path[2];
722
723 /*
724 * According to RFC 8089, a file: URL can be reliably dereferenced if:
725 *
726 * o it has no/blank hostname, or
727 *
728 * o the hostname matches "localhost" (case-insensitively), or
729 *
730 * o the hostname is a FQDN that resolves to this machine.
731 *
732 * For brevity, we only consider URLs with empty, "localhost", or
733 * "127.0.0.1" hostnames as local.
734 *
735 * Additionally, there is an exception for URLs with a Windows drive
736 * letter in the authority (which was accidentally omitted from RFC 8089
737 * Appendix E, but believe me, it was meant to be there. --MK)
738 */
739 if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
740 /* the URL includes a host name, it must match "localhost" or
741 "127.0.0.1" to be valid */
742 if(!checkprefix("localhost/", ptr) &&
743 !checkprefix("127.0.0.1/", ptr)) {
744 /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
745 none */
746 return CURLUE_MALFORMED_INPUT;
747 }
748 ptr += 9; /* now points to the slash after the host */
749 }
750
751 path = ptr;
752 }
753
754 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
755 /* Don't allow Windows drive letters when not in Windows.
756 * This catches both "file:/c:" and "file:c:" */
757 if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
758 STARTS_WITH_URL_DRIVE_PREFIX(path)) {
759 /* File drive letters are only accepted in MSDOS/Windows */
760 return CURLUE_MALFORMED_INPUT;
761 }
762 #else
763 /* If the path starts with a slash and a drive letter, ditch the slash */
764 if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
765 /* This cannot be done with strcpy, as the memory chunks overlap! */
766 memmove(path, &path[1], strlen(&path[1]) + 1);
767 }
768 #endif
769
770 }
771 else {
772 /* clear path */
773 const char *p;
774 const char *hostp;
775 size_t len;
776 path[0] = 0;
777
778 if(url_has_scheme) {
779 int i = 0;
780 p = &url[schemelen + 1];
781 while(p && (*p == '/') && (i < 4)) {
782 p++;
783 i++;
784 }
785 if((i < 1) || (i>3))
786 /* less than one or more than three slashes */
787 return CURLUE_MALFORMED_INPUT;
788
789 schemep = schemebuf;
790 if(!Curl_builtin_scheme(schemep) &&
791 !(flags & CURLU_NON_SUPPORT_SCHEME))
792 return CURLUE_UNSUPPORTED_SCHEME;
793
794 if(junkscan(schemep))
795 return CURLUE_MALFORMED_INPUT;
796
797 }
798 else {
799 /* no scheme! */
800
801 if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
802 return CURLUE_MALFORMED_INPUT;
803 if(flags & CURLU_DEFAULT_SCHEME)
804 schemep = DEFAULT_SCHEME;
805
806 /*
807 * The URL was badly formatted, let's try without scheme specified.
808 */
809 p = url;
810 }
811 hostp = p; /* host name starts here */
812
813 while(*p && !HOSTNAME_END(*p)) /* find end of host name */
814 p++;
815
816 len = p - hostp;
817 if(len) {
818 memcpy(hostname, hostp, len);
819 hostname[len] = 0;
820 }
821 else {
822 if(!(flags & CURLU_NO_AUTHORITY))
823 return CURLUE_MALFORMED_INPUT;
824 }
825
826 len = strlen(p);
827 memcpy(path, p, len);
828 path[len] = 0;
829
830 if(schemep) {
831 u->scheme = strdup(schemep);
832 if(!u->scheme)
833 return CURLUE_OUT_OF_MEMORY;
834 }
835 }
836
837 if(junkscan(path))
838 return CURLUE_MALFORMED_INPUT;
839
840 if((flags & CURLU_URLENCODE) && path[0]) {
841 /* worst case output length is 3x the original! */
842 char *newp = malloc(strlen(path) * 3);
843 if(!newp)
844 return CURLUE_OUT_OF_MEMORY;
845 path_alloced = TRUE;
846 strcpy_url(newp, path, TRUE); /* consider it relative */
847 u->temppath = path = newp;
848 }
849
850 fragment = strchr(path, '#');
851 if(fragment) {
852 *fragment++ = 0;
853 if(fragment[0]) {
854 u->fragment = strdup(fragment);
855 if(!u->fragment)
856 return CURLUE_OUT_OF_MEMORY;
857 }
858 }
859
860 query = strchr(path, '?');
861 if(query) {
862 *query++ = 0;
863 /* done even if the query part is a blank string */
864 u->query = strdup(query);
865 if(!u->query)
866 return CURLUE_OUT_OF_MEMORY;
867 }
868
869 if(!path[0])
870 /* if there's no path left set, unset */
871 path = NULL;
872 else {
873 if(!(flags & CURLU_PATH_AS_IS)) {
874 /* remove ../ and ./ sequences according to RFC3986 */
875 char *newp = Curl_dedotdotify(path);
876 if(!newp)
877 return CURLUE_OUT_OF_MEMORY;
878
879 if(strcmp(newp, path)) {
880 /* if we got a new version */
881 if(path_alloced)
882 Curl_safefree(u->temppath);
883 u->temppath = path = newp;
884 path_alloced = TRUE;
885 }
886 else
887 free(newp);
888 }
889
890 u->path = path_alloced?path:strdup(path);
891 if(!u->path)
892 return CURLUE_OUT_OF_MEMORY;
893 u->temppath = NULL; /* used now */
894 }
895
896 if(hostname) {
897 /*
898 * Parse the login details and strip them out of the host name.
899 */
900 if(junkscan(hostname))
901 return CURLUE_MALFORMED_INPUT;
902
903 result = parse_hostname_login(u, &hostname, flags);
904 if(result)
905 return result;
906
907 result = Curl_parse_port(u, hostname);
908 if(result)
909 return result;
910
911 if(0 == strlen(hostname) && (flags & CURLU_NO_AUTHORITY)) {
912 /* Skip hostname check, it's allowed to be empty. */
913 }
914 else {
915 result = hostname_check(u, hostname);
916 if(result)
917 return result;
918 }
919
920 u->host = strdup(hostname);
921 if(!u->host)
922 return CURLUE_OUT_OF_MEMORY;
923
924 if((flags & CURLU_GUESS_SCHEME) && !schemep) {
925 /* legacy curl-style guess based on host name */
926 if(checkprefix("ftp.", hostname))
927 schemep = "ftp";
928 else if(checkprefix("dict.", hostname))
929 schemep = "dict";
930 else if(checkprefix("ldap.", hostname))
931 schemep = "ldap";
932 else if(checkprefix("imap.", hostname))
933 schemep = "imap";
934 else if(checkprefix("smtp.", hostname))
935 schemep = "smtp";
936 else if(checkprefix("pop3.", hostname))
937 schemep = "pop3";
938 else
939 schemep = "http";
940
941 u->scheme = strdup(schemep);
942 if(!u->scheme)
943 return CURLUE_OUT_OF_MEMORY;
944 }
945 }
946
947 Curl_safefree(u->scratch);
948 Curl_safefree(u->temppath);
949
950 return CURLUE_OK;
951 }
952
953 /*
954 * Parse the URL and set the relevant members of the Curl_URL struct.
955 */
parseurl(const char * url,CURLU * u,unsigned int flags)956 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
957 {
958 CURLUcode result = seturl(url, u, flags);
959 if(result) {
960 free_urlhandle(u);
961 memset(u, 0, sizeof(struct Curl_URL));
962 }
963 return result;
964 }
965
966 /*
967 */
curl_url(void)968 CURLU *curl_url(void)
969 {
970 return calloc(sizeof(struct Curl_URL), 1);
971 }
972
/*
 * Free a URL handle together with all strings it holds. Passing in NULL is
 * fine and does nothing.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
980
/*
 * Duplicate the string member 'name' of 'src' into 'dest' if it is set.
 * Jumps to the label 'fail', which the using function must provide, when
 * the copy cannot be allocated.
 *
 * Wrapped in do { } while(0) so that the macro acts as a single statement
 * and is safe to use in an unbraced if/else.
 */
#define DUP(dest, src, name)                    \
  do {                                          \
    if((src)->name) {                           \
      (dest)->name = strdup((src)->name);       \
      if(!(dest)->name)                         \
        goto fail;                              \
    }                                           \
  } while(0)
987
/*
 * Create a new handle that is a deep copy of 'in': every URL part that is
 * set gets duplicated, including the IPv6 zone id, and the numerical port
 * is copied. The scratch areas used during parsing are not copied.
 *
 * Returns the new handle, which the caller frees with curl_url_cleanup(),
 * or NULL if memory ran out.
 */
CURLU *curl_url_dup(CURLU *in)
{
  struct Curl_URL *u = calloc(sizeof(struct Curl_URL), 1);
  if(u) {
    DUP(u, in, scheme);
    DUP(u, in, user);
    DUP(u, in, password);
    DUP(u, in, options);
    DUP(u, in, host);
    DUP(u, in, zoneid); /* or the copy would lose the zone of the host */
    DUP(u, in, port);
    DUP(u, in, path);
    DUP(u, in, query);
    DUP(u, in, fragment);
    u->portnum = in->portnum;
  }
  return u;
  fail:
  /* frees the new handle and the parts duplicated so far */
  curl_url_cleanup(u);
  return NULL;
}
1008
curl_url_get(CURLU * u,CURLUPart what,char ** part,unsigned int flags)1009 CURLUcode curl_url_get(CURLU *u, CURLUPart what,
1010 char **part, unsigned int flags)
1011 {
1012 char *ptr;
1013 CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1014 char portbuf[7];
1015 bool urldecode = (flags & CURLU_URLDECODE)?1:0;
1016 bool plusdecode = FALSE;
1017 (void)flags;
1018 if(!u)
1019 return CURLUE_BAD_HANDLE;
1020 if(!part)
1021 return CURLUE_BAD_PARTPOINTER;
1022 *part = NULL;
1023
1024 switch(what) {
1025 case CURLUPART_SCHEME:
1026 ptr = u->scheme;
1027 ifmissing = CURLUE_NO_SCHEME;
1028 urldecode = FALSE; /* never for schemes */
1029 break;
1030 case CURLUPART_USER:
1031 ptr = u->user;
1032 ifmissing = CURLUE_NO_USER;
1033 break;
1034 case CURLUPART_PASSWORD:
1035 ptr = u->password;
1036 ifmissing = CURLUE_NO_PASSWORD;
1037 break;
1038 case CURLUPART_OPTIONS:
1039 ptr = u->options;
1040 ifmissing = CURLUE_NO_OPTIONS;
1041 break;
1042 case CURLUPART_HOST:
1043 ptr = u->host;
1044 ifmissing = CURLUE_NO_HOST;
1045 break;
1046 case CURLUPART_ZONEID:
1047 ptr = u->zoneid;
1048 break;
1049 case CURLUPART_PORT:
1050 ptr = u->port;
1051 ifmissing = CURLUE_NO_PORT;
1052 urldecode = FALSE; /* never for port */
1053 if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1054 /* there's no stored port number, but asked to deliver
1055 a default one for the scheme */
1056 const struct Curl_handler *h =
1057 Curl_builtin_scheme(u->scheme);
1058 if(h) {
1059 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1060 ptr = portbuf;
1061 }
1062 }
1063 else if(ptr && u->scheme) {
1064 /* there is a stored port number, but ask to inhibit if
1065 it matches the default one for the scheme */
1066 const struct Curl_handler *h =
1067 Curl_builtin_scheme(u->scheme);
1068 if(h && (h->defport == u->portnum) &&
1069 (flags & CURLU_NO_DEFAULT_PORT))
1070 ptr = NULL;
1071 }
1072 break;
1073 case CURLUPART_PATH:
1074 ptr = u->path;
1075 if(!ptr) {
1076 ptr = u->path = strdup("/");
1077 if(!u->path)
1078 return CURLUE_OUT_OF_MEMORY;
1079 }
1080 break;
1081 case CURLUPART_QUERY:
1082 ptr = u->query;
1083 ifmissing = CURLUE_NO_QUERY;
1084 plusdecode = urldecode;
1085 break;
1086 case CURLUPART_FRAGMENT:
1087 ptr = u->fragment;
1088 ifmissing = CURLUE_NO_FRAGMENT;
1089 break;
1090 case CURLUPART_URL: {
1091 char *url;
1092 char *scheme;
1093 char *options = u->options;
1094 char *port = u->port;
1095 char *allochost = NULL;
1096 if(u->scheme && strcasecompare("file", u->scheme)) {
1097 url = aprintf("file://%s%s%s",
1098 u->path,
1099 u->fragment? "#": "",
1100 u->fragment? u->fragment : "");
1101 }
1102 else if(!u->host)
1103 return CURLUE_NO_HOST;
1104 else {
1105 const struct Curl_handler *h = NULL;
1106 if(u->scheme)
1107 scheme = u->scheme;
1108 else if(flags & CURLU_DEFAULT_SCHEME)
1109 scheme = (char *) DEFAULT_SCHEME;
1110 else
1111 return CURLUE_NO_SCHEME;
1112
1113 h = Curl_builtin_scheme(scheme);
1114 if(!port && (flags & CURLU_DEFAULT_PORT)) {
1115 /* there's no stored port number, but asked to deliver
1116 a default one for the scheme */
1117 if(h) {
1118 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1119 port = portbuf;
1120 }
1121 }
1122 else if(port) {
1123 /* there is a stored port number, but asked to inhibit if it matches
1124 the default one for the scheme */
1125 if(h && (h->defport == u->portnum) &&
1126 (flags & CURLU_NO_DEFAULT_PORT))
1127 port = NULL;
1128 }
1129
1130 if(h && !(h->flags & PROTOPT_URLOPTIONS))
1131 options = NULL;
1132
1133 if((u->host[0] == '[') && u->zoneid) {
1134 /* make it '[ host %25 zoneid ]' */
1135 size_t hostlen = strlen(u->host);
1136 size_t alen = hostlen + 3 + strlen(u->zoneid) + 1;
1137 allochost = malloc(alen);
1138 if(!allochost)
1139 return CURLUE_OUT_OF_MEMORY;
1140 memcpy(allochost, u->host, hostlen - 1);
1141 msnprintf(&allochost[hostlen - 1], alen - hostlen + 1,
1142 "%%25%s]", u->zoneid);
1143 }
1144
1145 url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1146 scheme,
1147 u->user ? u->user : "",
1148 u->password ? ":": "",
1149 u->password ? u->password : "",
1150 options ? ";" : "",
1151 options ? options : "",
1152 (u->user || u->password || options) ? "@": "",
1153 allochost ? allochost : u->host,
1154 port ? ":": "",
1155 port ? port : "",
1156 (u->path && (u->path[0] != '/')) ? "/": "",
1157 u->path ? u->path : "/",
1158 (u->query && u->query[0]) ? "?": "",
1159 (u->query && u->query[0]) ? u->query : "",
1160 u->fragment? "#": "",
1161 u->fragment? u->fragment : "");
1162 free(allochost);
1163 }
1164 if(!url)
1165 return CURLUE_OUT_OF_MEMORY;
1166 *part = url;
1167 return CURLUE_OK;
1168 }
1169 default:
1170 ptr = NULL;
1171 break;
1172 }
1173 if(ptr) {
1174 *part = strdup(ptr);
1175 if(!*part)
1176 return CURLUE_OUT_OF_MEMORY;
1177 if(plusdecode) {
1178 /* convert + to space */
1179 char *plus;
1180 for(plus = *part; *plus; ++plus) {
1181 if(*plus == '+')
1182 *plus = ' ';
1183 }
1184 }
1185 if(urldecode) {
1186 char *decoded;
1187 size_t dlen;
1188 /* this unconditional rejection of control bytes is documented
1189 API behavior */
1190 CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen,
1191 REJECT_CTRL);
1192 free(*part);
1193 if(res) {
1194 *part = NULL;
1195 return CURLUE_URLDECODE;
1196 }
1197 *part = decoded;
1198 }
1199 return CURLUE_OK;
1200 }
1201 else
1202 return ifmissing;
1203 }
1204
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1205 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1206 const char *part, unsigned int flags)
1207 {
1208 char **storep = NULL;
1209 long port = 0;
1210 bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1211 bool plusencode = FALSE;
1212 bool urlskipslash = FALSE;
1213 bool appendquery = FALSE;
1214 bool equalsencode = FALSE;
1215
1216 if(!u)
1217 return CURLUE_BAD_HANDLE;
1218 if(!part) {
1219 /* setting a part to NULL clears it */
1220 switch(what) {
1221 case CURLUPART_URL:
1222 break;
1223 case CURLUPART_SCHEME:
1224 storep = &u->scheme;
1225 break;
1226 case CURLUPART_USER:
1227 storep = &u->user;
1228 break;
1229 case CURLUPART_PASSWORD:
1230 storep = &u->password;
1231 break;
1232 case CURLUPART_OPTIONS:
1233 storep = &u->options;
1234 break;
1235 case CURLUPART_HOST:
1236 storep = &u->host;
1237 break;
1238 case CURLUPART_ZONEID:
1239 storep = &u->zoneid;
1240 break;
1241 case CURLUPART_PORT:
1242 u->portnum = 0;
1243 storep = &u->port;
1244 break;
1245 case CURLUPART_PATH:
1246 storep = &u->path;
1247 break;
1248 case CURLUPART_QUERY:
1249 storep = &u->query;
1250 break;
1251 case CURLUPART_FRAGMENT:
1252 storep = &u->fragment;
1253 break;
1254 default:
1255 return CURLUE_UNKNOWN_PART;
1256 }
1257 if(storep && *storep) {
1258 Curl_safefree(*storep);
1259 }
1260 return CURLUE_OK;
1261 }
1262
1263 switch(what) {
1264 case CURLUPART_SCHEME:
1265 if(strlen(part) > MAX_SCHEME_LEN)
1266 /* too long */
1267 return CURLUE_MALFORMED_INPUT;
1268 if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1269 /* verify that it is a fine scheme */
1270 !Curl_builtin_scheme(part))
1271 return CURLUE_UNSUPPORTED_SCHEME;
1272 storep = &u->scheme;
1273 urlencode = FALSE; /* never */
1274 break;
1275 case CURLUPART_USER:
1276 storep = &u->user;
1277 break;
1278 case CURLUPART_PASSWORD:
1279 storep = &u->password;
1280 break;
1281 case CURLUPART_OPTIONS:
1282 storep = &u->options;
1283 break;
1284 case CURLUPART_HOST:
1285 storep = &u->host;
1286 Curl_safefree(u->zoneid);
1287 break;
1288 case CURLUPART_ZONEID:
1289 storep = &u->zoneid;
1290 break;
1291 case CURLUPART_PORT:
1292 {
1293 char *endp;
1294 urlencode = FALSE; /* never */
1295 port = strtol(part, &endp, 10); /* Port number must be decimal */
1296 if((port <= 0) || (port > 0xffff))
1297 return CURLUE_BAD_PORT_NUMBER;
1298 if(*endp)
1299 /* weirdly provided number, not good! */
1300 return CURLUE_MALFORMED_INPUT;
1301 storep = &u->port;
1302 }
1303 break;
1304 case CURLUPART_PATH:
1305 urlskipslash = TRUE;
1306 storep = &u->path;
1307 break;
1308 case CURLUPART_QUERY:
1309 plusencode = urlencode;
1310 appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1311 equalsencode = appendquery;
1312 storep = &u->query;
1313 break;
1314 case CURLUPART_FRAGMENT:
1315 storep = &u->fragment;
1316 break;
1317 case CURLUPART_URL: {
1318 /*
1319 * Allow a new URL to replace the existing (if any) contents.
1320 *
1321 * If the existing contents is enough for a URL, allow a relative URL to
1322 * replace it.
1323 */
1324 CURLUcode result;
1325 char *oldurl;
1326 char *redired_url;
1327 CURLU *handle2;
1328
1329 if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN + 1)) {
1330 handle2 = curl_url();
1331 if(!handle2)
1332 return CURLUE_OUT_OF_MEMORY;
1333 result = parseurl(part, handle2, flags);
1334 if(!result)
1335 mv_urlhandle(handle2, u);
1336 else
1337 curl_url_cleanup(handle2);
1338 return result;
1339 }
1340 /* extract the full "old" URL to do the redirect on */
1341 result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1342 if(result) {
1343 /* couldn't get the old URL, just use the new! */
1344 handle2 = curl_url();
1345 if(!handle2)
1346 return CURLUE_OUT_OF_MEMORY;
1347 result = parseurl(part, handle2, flags);
1348 if(!result)
1349 mv_urlhandle(handle2, u);
1350 else
1351 curl_url_cleanup(handle2);
1352 return result;
1353 }
1354
1355 /* apply the relative part to create a new URL */
1356 redired_url = concat_url(oldurl, part);
1357 free(oldurl);
1358 if(!redired_url)
1359 return CURLUE_OUT_OF_MEMORY;
1360
1361 /* now parse the new URL */
1362 handle2 = curl_url();
1363 if(!handle2) {
1364 free(redired_url);
1365 return CURLUE_OUT_OF_MEMORY;
1366 }
1367 result = parseurl(redired_url, handle2, flags);
1368 free(redired_url);
1369 if(!result)
1370 mv_urlhandle(handle2, u);
1371 else
1372 curl_url_cleanup(handle2);
1373 return result;
1374 }
1375 default:
1376 return CURLUE_UNKNOWN_PART;
1377 }
1378 DEBUGASSERT(storep);
1379 {
1380 const char *newp = part;
1381 size_t nalloc = strlen(part);
1382
1383 if(nalloc > CURL_MAX_INPUT_LENGTH)
1384 /* excessive input length */
1385 return CURLUE_MALFORMED_INPUT;
1386
1387 if(urlencode) {
1388 const unsigned char *i;
1389 char *o;
1390 bool free_part = FALSE;
1391 char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1392 if(!enc)
1393 return CURLUE_OUT_OF_MEMORY;
1394 if(plusencode) {
1395 /* space to plus */
1396 i = (const unsigned char *)part;
1397 for(o = enc; *i; ++o, ++i)
1398 *o = (*i == ' ') ? '+' : *i;
1399 *o = 0; /* null-terminate */
1400 part = strdup(enc);
1401 if(!part) {
1402 free(enc);
1403 return CURLUE_OUT_OF_MEMORY;
1404 }
1405 free_part = TRUE;
1406 }
1407 for(i = (const unsigned char *)part, o = enc; *i; i++) {
1408 if(Curl_isunreserved(*i) ||
1409 ((*i == '/') && urlskipslash) ||
1410 ((*i == '=') && equalsencode) ||
1411 ((*i == '+') && plusencode)) {
1412 if((*i == '=') && equalsencode)
1413 /* only skip the first equals sign */
1414 equalsencode = FALSE;
1415 *o = *i;
1416 o++;
1417 }
1418 else {
1419 msnprintf(o, 4, "%%%02x", *i);
1420 o += 3;
1421 }
1422 }
1423 *o = 0; /* null-terminate */
1424 newp = enc;
1425 if(free_part)
1426 free((char *)part);
1427 }
1428 else {
1429 char *p;
1430 newp = strdup(part);
1431 if(!newp)
1432 return CURLUE_OUT_OF_MEMORY;
1433 p = (char *)newp;
1434 while(*p) {
1435 /* make sure percent encoded are lower case */
1436 if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1437 (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1438 p[1] = (char)TOLOWER(p[1]);
1439 p[2] = (char)TOLOWER(p[2]);
1440 p += 3;
1441 }
1442 else
1443 p++;
1444 }
1445 }
1446
1447 if(appendquery) {
1448 /* Append the string onto the old query. Add a '&' separator if none is
1449 present at the end of the exsting query already */
1450 size_t querylen = u->query ? strlen(u->query) : 0;
1451 bool addamperand = querylen && (u->query[querylen -1] != '&');
1452 if(querylen) {
1453 size_t newplen = strlen(newp);
1454 char *p = malloc(querylen + addamperand + newplen + 1);
1455 if(!p) {
1456 free((char *)newp);
1457 return CURLUE_OUT_OF_MEMORY;
1458 }
1459 strcpy(p, u->query); /* original query */
1460 if(addamperand)
1461 p[querylen] = '&'; /* ampersand */
1462 strcpy(&p[querylen + addamperand], newp); /* new suffix */
1463 free((char *)newp);
1464 free(*storep);
1465 *storep = p;
1466 return CURLUE_OK;
1467 }
1468 }
1469
1470 if(what == CURLUPART_HOST) {
1471 if(0 == strlen(newp) && (flags & CURLU_NO_AUTHORITY)) {
1472 /* Skip hostname check, it's allowed to be empty. */
1473 }
1474 else {
1475 if(hostname_check(u, (char *)newp)) {
1476 free((char *)newp);
1477 return CURLUE_MALFORMED_INPUT;
1478 }
1479 }
1480 }
1481
1482 free(*storep);
1483 *storep = (char *)newp;
1484 }
1485 /* set after the string, to make it not assigned if the allocation above
1486 fails */
1487 if(port)
1488 u->portnum = port;
1489 return CURLUE_OK;
1490 }
1491