• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* wget.c - Simple downloader to get the resource file from an HTTP server
2  *
3  * Copyright 2016 Lipi C.H. Lee <lipisoft@gmail.com>
4  * Copyright 2021 Eric Molitor <eric@molitor.org>
5  *
6  * Relevant sources of information
7  * -------------------------------
8  * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230
9  * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1
10  * UTF-8 Encoded Header Values: https://www.rfc-editor.org/rfc/rfc5987
11  *
12  * Test URLs
13  * ---------
14  * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript
15  * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html
16  * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html
17  * TLS 1.0: https://tls-v1-0.badssl.com:1010/
18  * TLS 1.1: https://tls-v1-1.badssl.com:1011/
19  * TLS 1.2: https://tls-v1-2.badssl.com:1012/
20  * TLS 1.3: https://tls13.1d.pw/
21  * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt
22  *
23  *
24  * todo: Add support for configurable TLS versions
25  * todo: Add support for ftp
26  * todo: Add support for Transfer Encoding (gzip|deflate)
27  * todo: Add support for RFC5987
28 
29 USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):", TOYFLAG_USR|TOYFLAG_BIN))
30 
31 config WGET
32   bool "wget"
33   default n
34   help
35     usage: wget [OPTIONS]... [URL]
36         --max-redirect          maximum redirections allowed
37     -d, --debug                 print lots of debugging information
38     -O, --output-document=FILE  specify output filename
39 
40     examples:
41       wget http://www.example.com
42 
43 config WGET_LIBTLS
44   bool "Enable HTTPS support for wget via LibTLS"
45   default n
46   depends on WGET && !WGET_OPENSSL
47   help
48     Enable HTTPS support for wget by linking to LibTLS.
49     Supports using libtls, libretls or libtls-bearssl.
50 
51 config WGET_OPENSSL
52   bool "Enable HTTPS support for wget via OpenSSL"
53   default n
54   depends on WGET && !WGET_LIBTLS
55   help
56     Enable HTTPS support for wget by linking to OpenSSL.
57 */
58 
59 #define FOR_wget
60 #include "toys.h"
61 
62 #if CFG_WGET_LIBTLS
63 #define WGET_SSL 1
64 #include <tls.h>
65 #elif CFG_WGET_OPENSSL
66 #define WGET_SSL 1
67 #include <openssl/crypto.h>
68 #include <openssl/ssl.h>
69 #include <openssl/err.h>
70 #else
71 #define WGET_SSL 0
72 #endif
73 
74 #define WGET_FILENAME         "Content-Disposition: attachment; filename="
75 #define WGET_CHUNKED          "transfer-encoding: chunked"
76 #define WGET_LOCATION         "Location: "
77 #define WGET_LIBTLS_PROTOCOLS "tlsv1.2"
78 
79 #define WGET_IS_HTTP  (strncmp(TT.url, "http://", 7) == 0)
80 #define WGET_IS_HTTPS (WGET_SSL && (strncmp(TT.url, "https://", 8) == 0))
81 #define HTTPS (WGET_SSL && TT.https)
82 
GLOBALS(char * filename;long redirects;int sock,https;char * url;struct tls * tls;)83 GLOBALS(
84   char *filename;
85   long redirects;
86 
87   int sock, https;
88   char *url;
89 #if CFG_WGET_LIBTLS
90   struct tls *tls;
91 #elif CFG_WGET_OPENSSL
92   struct ssl_ctx_st *ctx;
93   struct ssl_st *ssl;
94 #endif
95 )
96 
// Case-insensitive search for needle in haystack. Returns a pointer to the
// character just past the first match, or NULL when there is no match.
static char *wget_strncaseafter(char *haystack, char *needle)
{
  char *hit = strcasestr(haystack, needle);

  return hit ? hit + strlen(needle) : NULL;
}
103 
104 // get http info in URL
wget_info(char * url,char ** host,char ** port,char ** path)105 static void wget_info(char *url, char **host, char **port, char **path)
106 {
107   char *ss = url;
108 
109   // Must start with case insensitive http:// or https://
110   if (strncasecmp(url, "http", 4)) url = 0;
111   else {
112     url += 4;
113     if ((TT.https = WGET_SSL && toupper(*url=='s'))) url++;
114     if (!strstart(&url, "://")) url = 0;
115   }
116   if (!url) error_exit("unsupported protocol: %s", ss);
117   if ((*path = strchr(*host = url, '/'))) *((*path)++) = 0;
118   else *path = "";
119 
120   // Get port number and trim literal IPv6 addresses
121   if (**host=='[' && (ss = strchr(++*host, ']'))) {
122     *ss++ = 0;
123     *port = (*ss==':') ? ++ss : 0;
124   } else if ((*port = strchr(*host, ':'))) *((*port)++) = 0;
125   if (!*port) *port = HTTPS ? "443" : "80";
126 }
127 
// Open the connection used for the current request.
//
// Plain http gets a TCP socket stored in TT.sock. For https (TT.https, which
// wget_info() sets before this is called) the configured TLS backend also
// performs the handshake. Every failure exits with an error message.
static void wget_connect(char *host, char *port)
{
  // Branch on the parsed scheme flag instead of re-matching TT.url: the URL
  // scheme is accepted case-insensitively by wget_info().
  if (!HTTPS) {
    struct addrinfo *a =
        xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0);
    TT.sock = xconnectany(a);
  } else {
#if CFG_WGET_LIBTLS
    struct tls_config *cfg = NULL;
    uint32_t protocols;

    // There is no context to ask for an error string when creation fails,
    // so these two report only the failing call.
    if ((TT.tls = tls_client()) == NULL) error_exit("tls_client");
    if ((cfg = tls_config_new()) == NULL) error_exit("tls_config_new");
    if (tls_config_parse_protocols(&protocols, WGET_LIBTLS_PROTOCOLS) != 0)
      error_exit("tls_config_parse_protocols");
    if (tls_config_set_protocols(cfg, protocols) != 0)
      error_exit("tls_config_set_protocols: %s", tls_config_error(cfg));
    if (tls_configure(TT.tls, cfg) != 0)
      error_exit("tls_configure: %s", tls_error(TT.tls));
    tls_config_free(cfg);

    if (tls_connect(TT.tls, host, port) != 0)
      error_exit("tls_connect: %s", tls_error(TT.tls));
#elif CFG_WGET_OPENSSL
    SSL_library_init();
    OpenSSL_add_all_algorithms();
    SSL_load_error_strings();
    ERR_load_crypto_strings();

    // NOTE(review): nothing here calls SSL_CTX_set_verify(), so peer
    // certificate checking is whatever the context default is — confirm.
    TT.ctx = SSL_CTX_new(TLS_client_method());
    if (!TT.ctx) error_exit("SSL_CTX_new");

    struct addrinfo *a =
        xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0);
    TT.sock = xconnectany(a);

    TT.ssl = SSL_new(TT.ctx);
    if (!TT.ssl)
      error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));

    if (!SSL_set_tlsext_host_name(TT.ssl, host))
      error_exit("SSL_set_tlsext_host_name: %s",
                 ERR_error_string(ERR_get_error(), NULL));

    if (SSL_set_fd(TT.ssl, TT.sock) != 1)
      error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));
    // Only a return of 1 means the handshake completed; 0 and negative
    // values are both failures.
    if (SSL_connect(TT.ssl) != 1)
      error_exit("SSL_connect: %s", ERR_error_string(ERR_get_error(), NULL));

    if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl));
#endif
  }
}
181 
wget_read(void * buf,size_t len)182 static size_t wget_read(void *buf, size_t len)
183 {
184   if (WGET_IS_HTTP) return xread(TT.sock, buf, len);
185   else if (WGET_IS_HTTPS) {
186 #if CFG_WGET_LIBTLS
187    ssize_t ret = tls_read(TT.tls, buf, len);
188    if (ret < 0) error_exit("tls_read: %s", tls_error(TT.tls));
189    return ret;
190 #elif CFG_WGET_OPENSSL
191    int ret = SSL_read(TT.ssl, buf, (int) len);
192    if (ret < 0)
193      error_exit("SSL_read: %s", ERR_error_string(ERR_get_error(), NULL));
194    return ret;
195 #endif
196   } else error_exit("unsupported protocol");
197 }
198 
wget_write(void * buf,size_t len)199 static void wget_write(void *buf, size_t len)
200 {
201   if (WGET_IS_HTTP) {
202     xwrite(TT.sock, buf, len);
203   } else if (WGET_IS_HTTPS) {
204 #if CFG_WGET_LIBTLS
205     if (len != tls_write(TT.tls, buf, len))
206       error_exit("tls_write: %s", tls_error(TT.tls));
207 #elif CFG_WGET_OPENSSL
208     if (len != SSL_write(TT.ssl, buf, (int) len))
209       error_exit("SSL_write: %s", ERR_error_string(ERR_get_error(), NULL));
210 #endif
211   } else error_exit("unsupported protocol");
212 }
213 
wget_close()214 static void wget_close()
215 {
216   if (TT.sock) {
217       xclose(TT.sock);
218       TT.sock = 0;
219   }
220 
221 #if CFG_WGET_LIBTLS
222   if (TT.tls) {
223     tls_close(TT.tls);
224     tls_free(TT.tls);
225     TT.tls = NULL;
226   }
227 #elif CFG_WGET_OPENSSL
228   if (TT.ssl) {
229     SSL_shutdown(TT.ssl);
230     SSL_free(TT.ssl);
231     TT.ssl = NULL;
232   }
233 
234   if (TT.ctx) {
235     SSL_CTX_free(TT.ctx);
236     TT.ctx = NULL;
237   }
238 #endif
239 }
240 
// Find a header in the NUL terminated response text.
//
// header  response text (status line plus header lines)
// val     text to look for, compared case-insensitively
// Returns a pointer just past the match, or NULL when it is absent.
//
// The match must begin at the start of the text or directly after a line
// feed, so that "Location: " does not match inside "Content-Location: ".
static char* wget_find_header(char *header, char *val) {
  size_t skip = strlen(val);
  char *hit = header;

  while ((hit = strcasestr(hit, val))) {
    if (hit == header || hit[-1] == '\n') return hit + skip;
    hit++; // matched in the middle of a line, keep looking further on
  }

  return NULL;
}
245 
// Report whether wget_find_header() locates val in header.
// Returns 1 when it does and 0 when it does not.
static int wget_has_header(char *header, char *val)
{
  if (wget_find_header(header, val)) return 1;

  return 0;
}
250 
wget_redirect(char * header)251 static char *wget_redirect(char *header)
252 {
253   char *redir = wget_find_header(header, WGET_LOCATION);
254   if (!redir) error_exit("could not parse redirect URL");
255   return xstrndup(redir, stridx(redir, '\r'));
256 }
257 
wget_filename(char * header,char * path)258 static char *wget_filename(char *header, char *path)
259 {
260   char *f = wget_find_header(header, WGET_FILENAME);
261   if (f) strchr(f, '\r')[0] = '\0';
262 
263   if (!f && strchr(path, '/')) f = getbasename(path);
264   if (!f || !(*f) ) f = "index.html";
265 
266   return f;
267 }
268 
wget_main(void)269 void wget_main(void)
270 {
271   long status = 0;
272   size_t len, c_len = 0;
273   int fd, chunked;
274   char *body, *index, *host, *port, *path;
275   char agent[] = "toybox wget/" TOYBOX_VERSION;
276 
277   TT.url = xstrdup(toys.optargs[0]);
278 
279   for (;status != 200; TT.redirects--) {
280     if (TT.redirects < 0) error_exit("Too many redirects");
281 
282     wget_info(TT.url, &host, &port, &path);
283 
284     sprintf(toybuf, "GET /%s HTTP/1.1\r\nHost: %s\r\n"
285                     "User-Agent: %s\r\nConnection: close\r\n\r\n",
286                     path, host, agent);
287     if (FLAG(d)) printf("--- Request\n%s", toybuf);
288 
289     wget_connect(host, port);
290     wget_write(toybuf, strlen(toybuf));
291 
292     // Greedily read the HTTP response until either complete or toybuf is full
293     index = toybuf;
294     while ((len = wget_read(index, sizeof(toybuf) - (index - toybuf))) > 0)
295       index += len;
296 
297     //Process the response such that
298     //  Valid ranges  toybuf[0...index)      valid length is (index - toybuf)
299     //  Header ranges toybuf[0...body)       header length strlen(toybuf)
300     //  Remnant Body  toybuf[body...index)   valid remnant body length is len
301     //
302     // Per RFC7230 the header cannot contain a NUL octet so we NUL terminate at
303     // the footer of the header. This allows for normal string functions to be
304     // used when processing the header.
305     body = memmem(toybuf, index - toybuf, "\r\n\r\n", 4);
306     if (!body) error_exit("response header too large");
307     body[0] = '\0'; // NUL terminate the headers
308     body += 4; // Skip to the head of body
309     len = index - body; // Adjust len to be body length
310     if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf);
311 
312     status = strtol(strafter(toybuf, " "), NULL, 10);
313     if ((status == 301) || (status == 302)) {
314       free(TT.url);
315       TT.url = wget_redirect(toybuf);
316       wget_close();
317     } else if (status != 200) error_exit("response: %ld", status);
318   }
319 
320   if (!FLAG(O)) {
321     TT.filename = wget_filename(toybuf, path);
322     if (!access(TT.filename, F_OK))
323       error_exit("%s already exists", TT.filename);
324   }
325   fd = xcreate(TT.filename, (O_WRONLY|O_CREAT|O_TRUNC), 0644);
326 
327   chunked = wget_has_header(toybuf, WGET_CHUNKED);
328 
329   // If chunked we offset the first buffer by 2 character, meaning it is
330   // pointing at half of the header boundary, aka '\r\n'. This simplifies
331   // parsing of the first c_len length by allowing the do while loop to fall
332   // through on the first iteration and parse the first c_len size.
333   if (chunked) {
334     len = len + 2;
335     memmove(toybuf, body - 2, len);
336   } else {
337     memmove(toybuf, body, len);
338   }
339 
340   // len is the size remaining in toybuf
341   // c_len is the size of the remaining bytes in the current chunk
342   do {
343     if (chunked) {
344       if (c_len > 0) { // We have an incomplete c_len to write
345         if (len <= c_len) { // Buffer is less than the c_len so full write
346           xwrite(fd, toybuf, len);
347           c_len = c_len - len;
348           len = 0;
349         } else { // Buffer is larger than the c_len so partial write
350           xwrite(fd, toybuf, c_len);
351           len = len - c_len;
352           memmove(toybuf, toybuf + c_len, len);
353           c_len = 0;
354         }
355       }
356 
357       // If len is less than 2 we can't validate the chunk boundary so fall
358       // through and go read more into toybuf.
359       if ((c_len == 0) && (len > 2)) {
360         char *c;
361         if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary");
362 
363         // If we can't find the end of the new chunk signature fall through and
364         // read more into toybuf.
365         c = memmem(toybuf + 2, len - 2, "\r\n",2);
366         if (c) {
367           c_len = strtol(toybuf + 2, NULL, 16);
368           if (c_len == 0) goto exit; // A c_len of zero means we are complete
369           len = len - (c - toybuf) - 2;
370           memmove(toybuf, c + 2, len);
371         }
372       }
373 
374       if (len == sizeof(toybuf)) error_exit("chunk overflow");
375     } else {
376       xwrite(fd, toybuf, len);
377       len = 0;
378     }
379   } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0);
380 
381   exit:
382   wget_close();
383   free(TT.url);
384 }
385