1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22
23 #include "curl_setup.h"
24
25 #include "urldata.h"
26 #include "urlapi-int.h"
27 #include "strcase.h"
28 #include "dotdot.h"
29 #include "url.h"
30 #include "escape.h"
31 #include "curl_ctype.h"
32
33 /* The last 3 #include files should be in this order */
34 #include "curl_printf.h"
35 #include "curl_memory.h"
36 #include "memdebug.h"
37
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with an ASCII letter that is
 * immediately followed by a colon. The argument is parenthesized in the
 * expansion so that expressions such as 'ptr + 1' can be passed. It is
 * evaluated more than once, so it must not have side effects.
 */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
43
/* Drive prefix the way it can show up inside a URL: one ASCII letter, then
 * either ':' or '|', then a forward slash, a backslash or the terminating
 * NUL. The argument is evaluated several times. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  (((((str)[0] >= 'a') && ((str)[0] <= 'z')) || \
    (((str)[0] >= 'A') && ((str)[0] <= 'Z'))) && \
   (((str)[1] == '|') || ((str)[1] == ':')) && \
   (((str)[2] == 0) || ((str)[2] == '\\') || ((str)[2] == '/')))
51
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * Each string member is either NULL (part not set) or a heap allocation
 * that free_urlhandle() releases.
 */
struct Curl_URL {
  char *scheme;   /* scheme name, without the "://" */
  char *user;     /* user name from the login part */
  char *password; /* password from the login part */
  char *options; /* IMAP only? */
  char *host;     /* host name; IPv6 literals keep their brackets */
  char *port;     /* port number as a decimal string */
  char *path;     /* path part */
  char *query;    /* query part, without the leading '?' */
  char *fragment; /* fragment part, without the leading '#' */

  char *scratch; /* temporary scratch area, allocated by seturl() while it
                    parses */
  long portnum; /* the numerical version of 'port' */
};
67
/* scheme used for a URL without scheme when CURLU_DEFAULT_SCHEME is set */
#define DEFAULT_SCHEME "https"
69
free_urlhandle(struct Curl_URL * u)70 static void free_urlhandle(struct Curl_URL *u)
71 {
72 free(u->scheme);
73 free(u->user);
74 free(u->password);
75 free(u->options);
76 free(u->host);
77 free(u->port);
78 free(u->path);
79 free(u->query);
80 free(u->fragment);
81 free(u->scratch);
82 }
83
84 /* move the full contents of one handle onto another and
85 free the original */
mv_urlhandle(struct Curl_URL * from,struct Curl_URL * to)86 static void mv_urlhandle(struct Curl_URL *from,
87 struct Curl_URL *to)
88 {
89 free_urlhandle(to);
90 *to = *from;
91 free(from);
92 }
93
/*
 * Return a pointer to the character that ends the host name in 'url': the
 * first '/' or '?' found after the "//" (or from the start of the string
 * when there is no "//"). The '?' case covers URLs like
 * http://www.url.com?id=2380
 *
 * When neither character is present, the returned pointer addresses the
 * terminating zero byte.
 */
static const char *find_host_sep(const char *url)
{
  const char *host;
  const char *end;

  /* skip past the slashes that introduce the host name, if any */
  host = strstr(url, "//");
  host = host ? host + 2 : url;

  /* whichever of the two separators comes first ends the host name */
  end = strpbrk(host, "/?");
  if(!end)
    end = host + strlen(host);

  return end;
}
121
/*
 * Tell, without depending on the character encoding, whether a character
 * in a URL has to be percent-escaped: that is the case when it is neither
 * a control character, nor white space, nor a graphic character.
 * strlen_url() and strcpy_url() must both rely on this function so that
 * they agree on the output size.
 */
static bool urlchar_needs_escaping(int c)
{
  bool recognized = ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c);
  return !recognized;
}
131
132 /*
133 * strlen_url() returns the length of the given URL if the spaces within the
134 * URL were properly URL encoded.
135 * URL encoding should be skipped for host names, otherwise IDN resolution
136 * will fail.
137 */
strlen_url(const char * url,bool relative)138 static size_t strlen_url(const char *url, bool relative)
139 {
140 const unsigned char *ptr;
141 size_t newlen = 0;
142 bool left = TRUE; /* left side of the ? */
143 const unsigned char *host_sep = (const unsigned char *) url;
144
145 if(!relative)
146 host_sep = (const unsigned char *) find_host_sep(url);
147
148 for(ptr = (unsigned char *)url; *ptr; ptr++) {
149
150 if(ptr < host_sep) {
151 ++newlen;
152 continue;
153 }
154
155 switch(*ptr) {
156 case '?':
157 left = FALSE;
158 /* FALLTHROUGH */
159 default:
160 if(urlchar_needs_escaping(*ptr))
161 newlen += 2;
162 newlen++;
163 break;
164 case ' ':
165 if(left)
166 newlen += 3;
167 else
168 newlen++;
169 break;
170 }
171 }
172 return newlen;
173 }
174
175 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
176 * the source URL accordingly.
177 * URL encoding should be skipped for host names, otherwise IDN resolution
178 * will fail.
179 */
strcpy_url(char * output,const char * url,bool relative)180 static void strcpy_url(char *output, const char *url, bool relative)
181 {
182 /* we must add this with whitespace-replacing */
183 bool left = TRUE;
184 const unsigned char *iptr;
185 char *optr = output;
186 const unsigned char *host_sep = (const unsigned char *) url;
187
188 if(!relative)
189 host_sep = (const unsigned char *) find_host_sep(url);
190
191 for(iptr = (unsigned char *)url; /* read from here */
192 *iptr; /* until zero byte */
193 iptr++) {
194
195 if(iptr < host_sep) {
196 *optr++ = *iptr;
197 continue;
198 }
199
200 switch(*iptr) {
201 case '?':
202 left = FALSE;
203 /* FALLTHROUGH */
204 default:
205 if(urlchar_needs_escaping(*iptr)) {
206 msnprintf(optr, 4, "%%%02x", *iptr);
207 optr += 3;
208 }
209 else
210 *optr++=*iptr;
211 break;
212 case ' ':
213 if(left) {
214 *optr++='%'; /* add a '%' */
215 *optr++='2'; /* add a '2' */
216 *optr++='0'; /* add a '0' */
217 }
218 else
219 *optr++='+'; /* add a '+' here */
220 break;
221 }
222 }
223 *optr = 0; /* zero terminate output buffer */
224
225 }
226
227 /*
228 * Returns true if the given URL is absolute (as opposed to relative) within
229 * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
230 * non-NULL.
231 */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen)232 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
233 {
234 size_t i;
235 #ifdef WIN32
236 if(STARTS_WITH_DRIVE_PREFIX(url))
237 return FALSE;
238 #endif
239 for(i = 0; i < buflen && url[i]; ++i) {
240 char s = url[i];
241 if((s == ':') && (url[i + 1] == '/')) {
242 if(buf)
243 buf[i] = 0;
244 return TRUE;
245 }
246 /* RFC 3986 3.1 explains:
247 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
248 */
249 else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
250 if(buf)
251 buf[i] = (char)TOLOWER(s);
252 }
253 else
254 break;
255 }
256 return FALSE;
257 }
258
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 * The returned pointer must be freed by the caller unless NULL
 * (returns NULL on out of memory).
 *
 * 'base' is the absolute URL to resolve against, 'relurl' is the relative
 * reference. Neither input string is modified; the work is done on a
 * private copy of 'base' that gets truncated in place at the point where
 * the new part is to be appended.
 */
static char *concat_url(const char *base, const char *relurl)
{
  /***
   Try to append the new path to the old URL, to the right of the host
   part. This is a heuristic that works on the raw strings, not a full
   RFC 3986 reference resolution.
  */
  char *newest;
  char *protsep;
  char *pathsep;
  size_t newlen;
  bool host_changed = FALSE;

  const char *useurl = relurl;
  size_t urllen;

  /* we must make our own copy of the URL to play with, as it may
     point to read-only data */
  char *url_clone = strdup(base);

  if(!url_clone)
    return NULL; /* skip out of this NOW */

  /* protsep points to the start of the host name */
  protsep = strstr(url_clone, "//");
  if(!protsep)
    protsep = url_clone;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0; /* number of leading "../" in the relative part */

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or if the new URL is just a query string (starts with a
       '?')  we append the new one at the end of the entire currently worked
       out URL */
    if(useurl[0] != '?') {
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;
    }

    /* Check if there's any slash after the host name, and if so, remember
       that position instead. From here on protsep is either the start of
       the remaining path or NULL when the base has no path left. */
    pathsep = strchr(protsep, '/');
    if(pathsep)
      protsep = pathsep + 1;
    else
      protsep = NULL;

    /* now deal with one "./" or any amount of "../" in the newurl
       and act accordingly */

    if((useurl[0] == '.') && (useurl[1] == '/'))
      useurl += 2; /* just skip the "./" */

    while((useurl[0] == '.') &&
          (useurl[1] == '.') &&
          (useurl[2] == '/')) {
      level++;
      useurl += 3; /* pass the "../" */
    }

    if(protsep) {
      while(level--) {
        /* cut off one more level from the right of the original URL */
        pathsep = strrchr(protsep, '/');
        if(pathsep)
          *pathsep = 0;
        else {
          /* no more directories to drop: empty the path and stop */
          *protsep = 0;
          break;
        }
      }
    }
  }
  else {
    /* We got a new absolute path for this server */

    if((relurl[0] == '/') && (relurl[1] == '/')) {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.url.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.url.com?id=2380" which doesn't
           use a slash separator as it is supposed to, we need to check for a
           ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  /* If the new part contains a space, this is a mighty stupid redirect
     but we still make an effort to do "right". To the left of a '?'
     letter we replace each space with %20 while it is replaced with '+'
     on the right side of the '?' letter.

     The new part is treated as "relative" (encoded from its first byte)
     unless it brought a new host name, which must be left unencoded.
  */
  newlen = strlen_url(useurl, !host_changed);

  urllen = strlen(url_clone);

  newest = malloc(urllen + 1 + /* possible slash */
                  newlen + 1 /* zero byte */);

  if(!newest) {
    free(url_clone); /* don't leak this */
    return NULL;
  }

  /* copy over the root url part */
  memcpy(newest, url_clone, urllen);

  /* check if we need to append a slash: not when the new part brings its
     own leading slash, not when the remaining base path was emptied and
     not when the new part is only a query */
  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    ;
  else
    newest[urllen++]='/';

  /* then append the new piece on the right side */
  strcpy_url(&newest[urllen], useurl, !host_changed);

  free(url_clone);

  return newest;
}
417
418 /*
419 * parse_hostname_login()
420 *
421 * Parse the login details (user name, password and options) from the URL and
422 * strip them out of the host name
423 *
424 */
parse_hostname_login(struct Curl_URL * u,const struct Curl_handler * h,char ** hostname,unsigned int flags)425 static CURLUcode parse_hostname_login(struct Curl_URL *u,
426 const struct Curl_handler *h,
427 char **hostname,
428 unsigned int flags)
429 {
430 CURLUcode result = CURLUE_OK;
431 CURLcode ccode;
432 char *userp = NULL;
433 char *passwdp = NULL;
434 char *optionsp = NULL;
435
436 /* At this point, we're hoping all the other special cases have
437 * been taken care of, so conn->host.name is at most
438 * [user[:password][;options]]@]hostname
439 *
440 * We need somewhere to put the embedded details, so do that first.
441 */
442
443 char *ptr = strchr(*hostname, '@');
444 char *login = *hostname;
445
446 if(!ptr)
447 goto out;
448
449 /* We will now try to extract the
450 * possible login information in a string like:
451 * ftp://user:password@ftp.my.site:8021/README */
452 *hostname = ++ptr;
453
454 /* We could use the login information in the URL so extract it. Only parse
455 options if the handler says we should. Note that 'h' might be NULL! */
456 ccode = Curl_parse_login_details(login, ptr - login - 1,
457 &userp, &passwdp,
458 (h && (h->flags & PROTOPT_URLOPTIONS)) ?
459 &optionsp:NULL);
460 if(ccode) {
461 result = CURLUE_MALFORMED_INPUT;
462 goto out;
463 }
464
465 if(userp) {
466 if(flags & CURLU_DISALLOW_USER) {
467 /* Option DISALLOW_USER is set and url contains username. */
468 result = CURLUE_USER_NOT_ALLOWED;
469 goto out;
470 }
471
472 u->user = userp;
473 }
474
475 if(passwdp)
476 u->password = passwdp;
477
478 if(optionsp)
479 u->options = optionsp;
480
481 return CURLUE_OK;
482 out:
483
484 free(userp);
485 free(passwdp);
486 free(optionsp);
487
488 return result;
489 }
490
Curl_parse_port(struct Curl_URL * u,char * hostname)491 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
492 {
493 char *portptr = NULL;
494 char endbracket;
495 int len;
496
497 /*
498 * Find the end of an IPv6 address, either on the ']' ending bracket or
499 * a percent-encoded zone index.
500 */
501 if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
502 &endbracket, &len)) {
503 if(']' == endbracket)
504 portptr = &hostname[len];
505 else if('%' == endbracket) {
506 int zonelen = len;
507 if(1 == sscanf(hostname + zonelen, "25%*[^]]%c%n", &endbracket, &len)) {
508 if(']' != endbracket)
509 return CURLUE_MALFORMED_INPUT;
510 portptr = &hostname[--zonelen + len + 1];
511 }
512 else
513 return CURLUE_MALFORMED_INPUT;
514 }
515 else
516 return CURLUE_MALFORMED_INPUT;
517
518 /* this is a RFC2732-style specified IP-address */
519 if(portptr && *portptr) {
520 if(*portptr != ':')
521 return CURLUE_MALFORMED_INPUT;
522 }
523 else
524 portptr = NULL;
525 }
526 else
527 portptr = strchr(hostname, ':');
528
529 if(portptr) {
530 char *rest;
531 long port;
532 char portbuf[7];
533
534 /* Browser behavior adaptation. If there's a colon with no digits after,
535 just cut off the name there which makes us ignore the colon and just
536 use the default port. Firefox, Chrome and Safari all do that. */
537 if(!portptr[1]) {
538 *portptr = '\0';
539 return CURLUE_OK;
540 }
541
542 if(!ISDIGIT(portptr[1]))
543 return CURLUE_BAD_PORT_NUMBER;
544
545 port = strtol(portptr + 1, &rest, 10); /* Port number must be decimal */
546
547 if((port <= 0) || (port > 0xffff))
548 /* Single unix standard says port numbers are 16 bits long, but we don't
549 treat port zero as OK. */
550 return CURLUE_BAD_PORT_NUMBER;
551
552 if(rest[0])
553 return CURLUE_BAD_PORT_NUMBER;
554
555 *portptr++ = '\0'; /* cut off the name there */
556 *rest = 0;
557 /* generate a new port number string to get rid of leading zeroes etc */
558 msnprintf(portbuf, sizeof(portbuf), "%ld", port);
559 u->portnum = port;
560 u->port = strdup(portbuf);
561 if(!u->port)
562 return CURLUE_OUT_OF_MEMORY;
563 }
564
565 return CURLUE_OK;
566 }
567
568 /* scan for byte values < 31 or 127 */
junkscan(char * part)569 static CURLUcode junkscan(char *part)
570 {
571 if(part) {
572 static const char badbytes[]={
573 /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
574 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
575 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
576 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
577 0x7f,
578 0x00 /* zero terminate */
579 };
580 size_t n = strlen(part);
581 size_t nfine = strcspn(part, badbytes);
582 if(nfine != n)
583 /* since we don't know which part is scanned, return a generic error
584 code */
585 return CURLUE_MALFORMED_INPUT;
586 }
587 return CURLUE_OK;
588 }
589
hostname_check(char * hostname,unsigned int flags)590 static CURLUcode hostname_check(char *hostname, unsigned int flags)
591 {
592 const char *l = NULL; /* accepted characters */
593 size_t len;
594 size_t hlen = strlen(hostname);
595 (void)flags;
596
597 if(hostname[0] == '[') {
598 hostname++;
599 l = "0123456789abcdefABCDEF::.%";
600 hlen -= 2;
601 }
602
603 if(l) {
604 /* only valid letters are ok */
605 len = strspn(hostname, l);
606 if(hlen != len)
607 /* hostname with bad content */
608 return CURLUE_MALFORMED_INPUT;
609 }
610 else {
611 /* letters from the second string is not ok */
612 len = strcspn(hostname, " ");
613 if(hlen != len)
614 /* hostname with bad content */
615 return CURLUE_MALFORMED_INPUT;
616 }
617 return CURLUE_OK;
618 }
619
/* non-zero for the characters that end the host name part of a URL */
#define HOSTNAME_END(x) (((x) == '#') || ((x) == '?') || ((x) == '/'))
621
seturl(const char * url,CURLU * u,unsigned int flags)622 static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
623 {
624 char *path;
625 bool path_alloced = FALSE;
626 char *hostname;
627 char *query = NULL;
628 char *fragment = NULL;
629 CURLUcode result;
630 bool url_has_scheme = FALSE;
631 char schemebuf[MAX_SCHEME_LEN];
632 char *schemep = NULL;
633 size_t schemelen = 0;
634 size_t urllen;
635 const struct Curl_handler *h = NULL;
636
637 if(!url)
638 return CURLUE_MALFORMED_INPUT;
639
640 /*************************************************************
641 * Parse the URL.
642 ************************************************************/
643 /* allocate scratch area */
644 urllen = strlen(url);
645 path = u->scratch = malloc(urllen * 2 + 2);
646 if(!path)
647 return CURLUE_OUT_OF_MEMORY;
648
649 hostname = &path[urllen + 1];
650 hostname[0] = 0;
651
652 if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
653 url_has_scheme = TRUE;
654 schemelen = strlen(schemebuf);
655 }
656
657 /* handle the file: scheme */
658 if(url_has_scheme && strcasecompare(schemebuf, "file")) {
659 /* path has been allocated large enough to hold this */
660 strcpy(path, &url[5]);
661
662 hostname = NULL; /* no host for file: URLs */
663 u->scheme = strdup("file");
664 if(!u->scheme)
665 return CURLUE_OUT_OF_MEMORY;
666
667 /* Extra handling URLs with an authority component (i.e. that start with
668 * "file://")
669 *
670 * We allow omitted hostname (e.g. file:/<path>) -- valid according to
671 * RFC 8089, but not the (current) WHAT-WG URL spec.
672 */
673 if(path[0] == '/' && path[1] == '/') {
674 /* swallow the two slashes */
675 char *ptr = &path[2];
676
677 /*
678 * According to RFC 8089, a file: URL can be reliably dereferenced if:
679 *
680 * o it has no/blank hostname, or
681 *
682 * o the hostname matches "localhost" (case-insensitively), or
683 *
684 * o the hostname is a FQDN that resolves to this machine.
685 *
686 * For brevity, we only consider URLs with empty, "localhost", or
687 * "127.0.0.1" hostnames as local.
688 *
689 * Additionally, there is an exception for URLs with a Windows drive
690 * letter in the authority (which was accidentally omitted from RFC 8089
691 * Appendix E, but believe me, it was meant to be there. --MK)
692 */
693 if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
694 /* the URL includes a host name, it must match "localhost" or
695 "127.0.0.1" to be valid */
696 if(!checkprefix("localhost/", ptr) &&
697 !checkprefix("127.0.0.1/", ptr)) {
698 /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
699 none */
700 return CURLUE_MALFORMED_INPUT;
701 }
702 ptr += 9; /* now points to the slash after the host */
703 }
704
705 path = ptr;
706 }
707
708 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
709 /* Don't allow Windows drive letters when not in Windows.
710 * This catches both "file:/c:" and "file:c:" */
711 if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
712 STARTS_WITH_URL_DRIVE_PREFIX(path)) {
713 /* File drive letters are only accepted in MSDOS/Windows */
714 return CURLUE_MALFORMED_INPUT;
715 }
716 #else
717 /* If the path starts with a slash and a drive letter, ditch the slash */
718 if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
719 /* This cannot be done with strcpy, as the memory chunks overlap! */
720 memmove(path, &path[1], strlen(&path[1]) + 1);
721 }
722 #endif
723
724 }
725 else {
726 /* clear path */
727 const char *p;
728 const char *hostp;
729 size_t len;
730 path[0] = 0;
731
732 if(url_has_scheme) {
733 int i = 0;
734 p = &url[schemelen + 1];
735 while(p && (*p == '/') && (i < 4)) {
736 p++;
737 i++;
738 }
739 if((i < 1) || (i>3))
740 /* less than one or more than three slashes */
741 return CURLUE_MALFORMED_INPUT;
742
743 schemep = schemebuf;
744 if(!Curl_builtin_scheme(schemep) &&
745 !(flags & CURLU_NON_SUPPORT_SCHEME))
746 return CURLUE_UNSUPPORTED_SCHEME;
747
748 if(junkscan(schemep))
749 return CURLUE_MALFORMED_INPUT;
750 }
751 else {
752 /* no scheme! */
753
754 if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
755 return CURLUE_MALFORMED_INPUT;
756 if(flags & CURLU_DEFAULT_SCHEME)
757 schemep = (char *) DEFAULT_SCHEME;
758
759 /*
760 * The URL was badly formatted, let's try without scheme specified.
761 */
762 p = url;
763 }
764 hostp = p; /* host name starts here */
765
766 while(*p && !HOSTNAME_END(*p)) /* find end of host name */
767 p++;
768
769 len = p - hostp;
770 if(!len)
771 return CURLUE_MALFORMED_INPUT;
772
773 memcpy(hostname, hostp, len);
774 hostname[len] = 0;
775
776 if((flags & CURLU_GUESS_SCHEME) && !schemep) {
777 /* legacy curl-style guess based on host name */
778 if(checkprefix("ftp.", hostname))
779 schemep = (char *)"ftp";
780 else if(checkprefix("dict.", hostname))
781 schemep = (char *)"dict";
782 else if(checkprefix("ldap.", hostname))
783 schemep = (char *)"ldap";
784 else if(checkprefix("imap.", hostname))
785 schemep = (char *)"imap";
786 else if(checkprefix("smtp.", hostname))
787 schemep = (char *)"smtp";
788 else if(checkprefix("pop3.", hostname))
789 schemep = (char *)"pop3";
790 else
791 schemep = (char *)"http";
792 }
793
794 len = strlen(p);
795 memcpy(path, p, len);
796 path[len] = 0;
797
798 u->scheme = strdup(schemep);
799 if(!u->scheme)
800 return CURLUE_OUT_OF_MEMORY;
801 }
802
803 /* if this is a known scheme, get some details */
804 h = Curl_builtin_scheme(u->scheme);
805
806 if(junkscan(path))
807 return CURLUE_MALFORMED_INPUT;
808
809 query = strchr(path, '?');
810 if(query)
811 *query++ = 0;
812
813 fragment = strchr(query?query:path, '#');
814 if(fragment)
815 *fragment++ = 0;
816
817 if(!path[0])
818 /* if there's no path set, unset */
819 path = NULL;
820 else if(!(flags & CURLU_PATH_AS_IS)) {
821 /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
822 char *newp = Curl_dedotdotify(path);
823 if(!newp)
824 return CURLUE_OUT_OF_MEMORY;
825
826 if(strcmp(newp, path)) {
827 /* if we got a new version */
828 path = newp;
829 path_alloced = TRUE;
830 }
831 else
832 free(newp);
833 }
834 if(path) {
835 u->path = path_alloced?path:strdup(path);
836 if(!u->path)
837 return CURLUE_OUT_OF_MEMORY;
838 }
839
840 if(hostname) {
841 /*
842 * Parse the login details and strip them out of the host name.
843 */
844 if(junkscan(hostname))
845 return CURLUE_MALFORMED_INPUT;
846
847 result = parse_hostname_login(u, h, &hostname, flags);
848 if(result)
849 return result;
850
851 result = Curl_parse_port(u, hostname);
852 if(result)
853 return result;
854
855 result = hostname_check(hostname, flags);
856 if(result)
857 return result;
858
859 u->host = strdup(hostname);
860 if(!u->host)
861 return CURLUE_OUT_OF_MEMORY;
862 }
863
864 if(query) {
865 u->query = strdup(query);
866 if(!u->query)
867 return CURLUE_OUT_OF_MEMORY;
868 }
869 if(fragment && fragment[0]) {
870 u->fragment = strdup(fragment);
871 if(!u->fragment)
872 return CURLUE_OUT_OF_MEMORY;
873 }
874
875 free(u->scratch);
876 u->scratch = NULL;
877
878 return CURLUE_OK;
879 }
880
881 /*
882 * Parse the URL and set the relevant members of the Curl_URL struct.
883 */
parseurl(const char * url,CURLU * u,unsigned int flags)884 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
885 {
886 CURLUcode result = seturl(url, u, flags);
887 if(result) {
888 free_urlhandle(u);
889 memset(u, 0, sizeof(struct Curl_URL));
890 }
891 return result;
892 }
893
894 /*
895 */
curl_url(void)896 CURLU *curl_url(void)
897 {
898 return calloc(sizeof(struct Curl_URL), 1);
899 }
900
/* Free a URL handle together with all the strings it holds. Passing in
   NULL is fine and does nothing. */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
908
/* Duplicate the string member 'name' from 'src' into 'dest' if it is set.
   Jumps to a 'fail' label, which the enclosing function must provide, when
   the copy cannot be allocated. Wrapped in do/while(0) so that the macro
   behaves as a single statement, also in an if/else without braces. */
#define DUP(dest, src, name)                \
  do {                                      \
    if((src)->name) {                       \
      (dest)->name = strdup((src)->name);   \
      if(!(dest)->name)                     \
        goto fail;                          \
    }                                       \
  } while(0)
915
/*
 * Create a new handle that holds copies of all the parts set in 'in'. The
 * scratch area is not copied.
 *
 * Returns NULL if 'in' is NULL or if memory runs out; a partially built
 * copy is freed in the latter case. The returned handle is to be freed
 * with curl_url_cleanup().
 */
CURLU *curl_url_dup(CURLU *in)
{
  struct Curl_URL *u;

  if(!in)
    /* nothing to duplicate */
    return NULL;

  u = calloc(sizeof(struct Curl_URL), 1);
  if(u) {
    DUP(u, in, scheme);
    DUP(u, in, user);
    DUP(u, in, password);
    DUP(u, in, options);
    DUP(u, in, host);
    DUP(u, in, port);
    DUP(u, in, path);
    DUP(u, in, query);
    DUP(u, in, fragment);
    u->portnum = in->portnum;
  }
  return u;
  fail:
  curl_url_cleanup(u);
  return NULL;
}
936
curl_url_get(CURLU * u,CURLUPart what,char ** part,unsigned int flags)937 CURLUcode curl_url_get(CURLU *u, CURLUPart what,
938 char **part, unsigned int flags)
939 {
940 char *ptr;
941 CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
942 char portbuf[7];
943 bool urldecode = (flags & CURLU_URLDECODE)?1:0;
944 bool plusdecode = FALSE;
945 (void)flags;
946 if(!u)
947 return CURLUE_BAD_HANDLE;
948 if(!part)
949 return CURLUE_BAD_PARTPOINTER;
950 *part = NULL;
951
952 switch(what) {
953 case CURLUPART_SCHEME:
954 ptr = u->scheme;
955 ifmissing = CURLUE_NO_SCHEME;
956 urldecode = FALSE; /* never for schemes */
957 break;
958 case CURLUPART_USER:
959 ptr = u->user;
960 ifmissing = CURLUE_NO_USER;
961 break;
962 case CURLUPART_PASSWORD:
963 ptr = u->password;
964 ifmissing = CURLUE_NO_PASSWORD;
965 break;
966 case CURLUPART_OPTIONS:
967 ptr = u->options;
968 ifmissing = CURLUE_NO_OPTIONS;
969 break;
970 case CURLUPART_HOST:
971 ptr = u->host;
972 ifmissing = CURLUE_NO_HOST;
973 break;
974 case CURLUPART_PORT:
975 ptr = u->port;
976 ifmissing = CURLUE_NO_PORT;
977 urldecode = FALSE; /* never for port */
978 if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
979 /* there's no stored port number, but asked to deliver
980 a default one for the scheme */
981 const struct Curl_handler *h =
982 Curl_builtin_scheme(u->scheme);
983 if(h) {
984 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
985 ptr = portbuf;
986 }
987 }
988 else if(ptr && u->scheme) {
989 /* there is a stored port number, but ask to inhibit if
990 it matches the default one for the scheme */
991 const struct Curl_handler *h =
992 Curl_builtin_scheme(u->scheme);
993 if(h && (h->defport == u->portnum) &&
994 (flags & CURLU_NO_DEFAULT_PORT))
995 ptr = NULL;
996 }
997 break;
998 case CURLUPART_PATH:
999 ptr = u->path;
1000 if(!ptr) {
1001 ptr = u->path = strdup("/");
1002 if(!u->path)
1003 return CURLUE_OUT_OF_MEMORY;
1004 }
1005 break;
1006 case CURLUPART_QUERY:
1007 ptr = u->query;
1008 ifmissing = CURLUE_NO_QUERY;
1009 plusdecode = urldecode;
1010 break;
1011 case CURLUPART_FRAGMENT:
1012 ptr = u->fragment;
1013 ifmissing = CURLUE_NO_FRAGMENT;
1014 break;
1015 case CURLUPART_URL: {
1016 char *url;
1017 char *scheme;
1018 char *options = u->options;
1019 char *port = u->port;
1020 if(u->scheme && strcasecompare("file", u->scheme)) {
1021 url = aprintf("file://%s%s%s",
1022 u->path,
1023 u->fragment? "#": "",
1024 u->fragment? u->fragment : "");
1025 }
1026 else if(!u->host)
1027 return CURLUE_NO_HOST;
1028 else {
1029 const struct Curl_handler *h = NULL;
1030 if(u->scheme)
1031 scheme = u->scheme;
1032 else if(flags & CURLU_DEFAULT_SCHEME)
1033 scheme = (char *) DEFAULT_SCHEME;
1034 else
1035 return CURLUE_NO_SCHEME;
1036
1037 if(scheme) {
1038 h = Curl_builtin_scheme(scheme);
1039 if(!port && (flags & CURLU_DEFAULT_PORT)) {
1040 /* there's no stored port number, but asked to deliver
1041 a default one for the scheme */
1042 if(h) {
1043 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1044 port = portbuf;
1045 }
1046 }
1047 else if(port) {
1048 /* there is a stored port number, but asked to inhibit if it matches
1049 the default one for the scheme */
1050 if(h && (h->defport == u->portnum) &&
1051 (flags & CURLU_NO_DEFAULT_PORT))
1052 port = NULL;
1053 }
1054 }
1055 if(h && !(h->flags & PROTOPT_URLOPTIONS))
1056 options = NULL;
1057
1058 url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1059 scheme,
1060 u->user ? u->user : "",
1061 u->password ? ":": "",
1062 u->password ? u->password : "",
1063 options ? ";" : "",
1064 options ? options : "",
1065 (u->user || u->password || options) ? "@": "",
1066 u->host,
1067 port ? ":": "",
1068 port ? port : "",
1069 (u->path && (u->path[0] != '/')) ? "/": "",
1070 u->path ? u->path : "/",
1071 (u->query && u->query[0]) ? "?": "",
1072 (u->query && u->query[0]) ? u->query : "",
1073 u->fragment? "#": "",
1074 u->fragment? u->fragment : "");
1075 }
1076 if(!url)
1077 return CURLUE_OUT_OF_MEMORY;
1078 *part = url;
1079 return CURLUE_OK;
1080 }
1081 default:
1082 ptr = NULL;
1083 break;
1084 }
1085 if(ptr) {
1086 *part = strdup(ptr);
1087 if(!*part)
1088 return CURLUE_OUT_OF_MEMORY;
1089 if(plusdecode) {
1090 /* convert + to space */
1091 char *plus;
1092 for(plus = *part; *plus; ++plus) {
1093 if(*plus == '+')
1094 *plus = ' ';
1095 }
1096 }
1097 if(urldecode) {
1098 char *decoded;
1099 size_t dlen;
1100 CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
1101 free(*part);
1102 if(res) {
1103 *part = NULL;
1104 return CURLUE_URLDECODE;
1105 }
1106 *part = decoded;
1107 }
1108 return CURLUE_OK;
1109 }
1110 else
1111 return ifmissing;
1112 }
1113
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1114 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1115 const char *part, unsigned int flags)
1116 {
1117 char **storep = NULL;
1118 long port = 0;
1119 bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1120 bool plusencode = FALSE;
1121 bool urlskipslash = FALSE;
1122 bool appendquery = FALSE;
1123 bool equalsencode = FALSE;
1124
1125 if(!u)
1126 return CURLUE_BAD_HANDLE;
1127 if(!part) {
1128 /* setting a part to NULL clears it */
1129 switch(what) {
1130 case CURLUPART_URL:
1131 break;
1132 case CURLUPART_SCHEME:
1133 storep = &u->scheme;
1134 break;
1135 case CURLUPART_USER:
1136 storep = &u->user;
1137 break;
1138 case CURLUPART_PASSWORD:
1139 storep = &u->password;
1140 break;
1141 case CURLUPART_OPTIONS:
1142 storep = &u->options;
1143 break;
1144 case CURLUPART_HOST:
1145 storep = &u->host;
1146 break;
1147 case CURLUPART_PORT:
1148 storep = &u->port;
1149 break;
1150 case CURLUPART_PATH:
1151 storep = &u->path;
1152 break;
1153 case CURLUPART_QUERY:
1154 storep = &u->query;
1155 break;
1156 case CURLUPART_FRAGMENT:
1157 storep = &u->fragment;
1158 break;
1159 default:
1160 return CURLUE_UNKNOWN_PART;
1161 }
1162 if(storep && *storep) {
1163 free(*storep);
1164 *storep = NULL;
1165 }
1166 return CURLUE_OK;
1167 }
1168
1169 switch(what) {
1170 case CURLUPART_SCHEME:
1171 if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1172 /* verify that it is a fine scheme */
1173 !Curl_builtin_scheme(part))
1174 return CURLUE_UNSUPPORTED_SCHEME;
1175 storep = &u->scheme;
1176 urlencode = FALSE; /* never */
1177 break;
1178 case CURLUPART_USER:
1179 storep = &u->user;
1180 break;
1181 case CURLUPART_PASSWORD:
1182 storep = &u->password;
1183 break;
1184 case CURLUPART_OPTIONS:
1185 storep = &u->options;
1186 break;
1187 case CURLUPART_HOST:
1188 storep = &u->host;
1189 break;
1190 case CURLUPART_PORT:
1191 urlencode = FALSE; /* never */
1192 port = strtol(part, NULL, 10); /* Port number must be decimal */
1193 if((port <= 0) || (port > 0xffff))
1194 return CURLUE_BAD_PORT_NUMBER;
1195 storep = &u->port;
1196 break;
1197 case CURLUPART_PATH:
1198 urlskipslash = TRUE;
1199 storep = &u->path;
1200 break;
1201 case CURLUPART_QUERY:
1202 plusencode = urlencode;
1203 appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1204 equalsencode = appendquery;
1205 storep = &u->query;
1206 break;
1207 case CURLUPART_FRAGMENT:
1208 storep = &u->fragment;
1209 break;
1210 case CURLUPART_URL: {
1211 /*
1212 * Allow a new URL to replace the existing (if any) contents.
1213 *
1214 * If the existing contents is enough for a URL, allow a relative URL to
1215 * replace it.
1216 */
1217 CURLUcode result;
1218 char *oldurl;
1219 char *redired_url;
1220 CURLU *handle2;
1221
1222 if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN)) {
1223 handle2 = curl_url();
1224 if(!handle2)
1225 return CURLUE_OUT_OF_MEMORY;
1226 result = parseurl(part, handle2, flags);
1227 if(!result)
1228 mv_urlhandle(handle2, u);
1229 else
1230 curl_url_cleanup(handle2);
1231 return result;
1232 }
1233 /* extract the full "old" URL to do the redirect on */
1234 result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1235 if(result) {
1236 /* couldn't get the old URL, just use the new! */
1237 handle2 = curl_url();
1238 if(!handle2)
1239 return CURLUE_OUT_OF_MEMORY;
1240 result = parseurl(part, handle2, flags);
1241 if(!result)
1242 mv_urlhandle(handle2, u);
1243 else
1244 curl_url_cleanup(handle2);
1245 return result;
1246 }
1247
1248 /* apply the relative part to create a new URL */
1249 redired_url = concat_url(oldurl, part);
1250 free(oldurl);
1251 if(!redired_url)
1252 return CURLUE_OUT_OF_MEMORY;
1253
1254 /* now parse the new URL */
1255 handle2 = curl_url();
1256 if(!handle2) {
1257 free(redired_url);
1258 return CURLUE_OUT_OF_MEMORY;
1259 }
1260 result = parseurl(redired_url, handle2, flags);
1261 free(redired_url);
1262 if(!result)
1263 mv_urlhandle(handle2, u);
1264 else
1265 curl_url_cleanup(handle2);
1266 return result;
1267 }
1268 default:
1269 return CURLUE_UNKNOWN_PART;
1270 }
1271 if(storep) {
1272 const char *newp = part;
1273 size_t nalloc = strlen(part);
1274
1275 if(urlencode) {
1276 const char *i;
1277 char *o;
1278 bool free_part = FALSE;
1279 char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1280 if(!enc)
1281 return CURLUE_OUT_OF_MEMORY;
1282 if(plusencode) {
1283 /* space to plus */
1284 i = part;
1285 for(o = enc; *i; ++o, ++i)
1286 *o = (*i == ' ') ? '+' : *i;
1287 *o = 0; /* zero terminate */
1288 part = strdup(enc);
1289 if(!part) {
1290 free(enc);
1291 return CURLUE_OUT_OF_MEMORY;
1292 }
1293 free_part = TRUE;
1294 }
1295 for(i = part, o = enc; *i; i++) {
1296 if(Curl_isunreserved(*i) ||
1297 ((*i == '/') && urlskipslash) ||
1298 ((*i == '=') && equalsencode) ||
1299 ((*i == '+') && plusencode)) {
1300 if((*i == '=') && equalsencode)
1301 /* only skip the first equals sign */
1302 equalsencode = FALSE;
1303 *o = *i;
1304 o++;
1305 }
1306 else {
1307 msnprintf(o, 4, "%%%02x", *i);
1308 o += 3;
1309 }
1310 }
1311 *o = 0; /* zero terminate */
1312 newp = enc;
1313 if(free_part)
1314 free((char *)part);
1315 }
1316 else {
1317 char *p;
1318 newp = strdup(part);
1319 if(!newp)
1320 return CURLUE_OUT_OF_MEMORY;
1321 p = (char *)newp;
1322 while(*p) {
1323 /* make sure percent encoded are lower case */
1324 if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1325 (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1326 p[1] = (char)TOLOWER(p[1]);
1327 p[2] = (char)TOLOWER(p[2]);
1328 p += 3;
1329 }
1330 else
1331 p++;
1332 }
1333 }
1334
1335 if(appendquery) {
1336 /* Append the string onto the old query. Add a '&' separator if none is
1337 present at the end of the exsting query already */
1338 size_t querylen = u->query ? strlen(u->query) : 0;
1339 bool addamperand = querylen && (u->query[querylen -1] != '&');
1340 if(querylen) {
1341 size_t newplen = strlen(newp);
1342 char *p = malloc(querylen + addamperand + newplen + 1);
1343 if(!p) {
1344 free((char *)newp);
1345 return CURLUE_OUT_OF_MEMORY;
1346 }
1347 strcpy(p, u->query); /* original query */
1348 if(addamperand)
1349 p[querylen] = '&'; /* ampersand */
1350 strcpy(&p[querylen + addamperand], newp); /* new suffix */
1351 free((char *)newp);
1352 free(*storep);
1353 *storep = p;
1354 return CURLUE_OK;
1355 }
1356 }
1357
1358 free(*storep);
1359 *storep = (char *)newp;
1360 }
1361 /* set after the string, to make it not assigned if the allocation above
1362 fails */
1363 if(port)
1364 u->portnum = port;
1365 return CURLUE_OK;
1366 }
1367