1 /* Copyright Joyent, Inc. and other Node contributors.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to
5 * deal in the Software without restriction, including without limitation the
6 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 * sell copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 * IN THE SOFTWARE.
20 */
21 #include "url_parser.h"
22 #include <assert.h>
23 #include <stddef.h>
24 #include <ctype.h>
25 #include <string.h>
26 #include <limits.h>
27
/* BIT_AT(a, i) yields 1 when bit `i` of the byte array `a` is set and 0
 * otherwise. Bit 0 is the least-significant bit of a[0], bit 8 the
 * least-significant bit of a[1], and so on. `i` is evaluated twice.
 * An existing definition of BIT_AT is left untouched.
 */
#ifndef BIT_AT
# define BIT_AT(a, i)                                                   \
  ((int) (((unsigned int) (a)[(unsigned int) (i) / 8u]                  \
           >> ((unsigned int) (i) % 8u)) & 1u))
#endif
33
/* T(v) keeps a bit only in non-strict builds; it is used for the two
 * whitespace characters that are tolerated inside URLs in lenient mode.
 */
#if HTTP_PARSER_STRICT
# define T(v) 0
#else
# define T(v) v
#endif

/* Bitmap of the characters accepted as "normal" URL characters.
 *
 * Byte k describes characters 8*k .. 8*k+7; bit j of that byte (value 1 << j)
 * is set when character 8*k+j is accepted. Look it up with BIT_AT(). Only the
 * first 16 bytes (characters 0..127) are listed; the remaining 16 bytes are
 * zero-initialized.
 */
static const uint8_t normal_url_char[32] = {
  /*   0 ..   7: nul soh stx etx eot enq ack bel -- none accepted */
  0x00,
  /*   8 ..  15: only ht (9) and np (12), and only in non-strict mode */
  T(0x02) | T(0x10),
  /*  16 ..  23: dle dc1 dc2 dc3 dc4 nak syn etb -- none accepted */
  0x00,
  /*  24 ..  31: can em sub esc fs gs rs us -- none accepted */
  0x00,
  /*  32 ..  39: ! " $ % & ' accepted; space (32) and '#' (35) are not */
  0xF6,
  /*  40 ..  47: ( ) * + , - . / */
  0xFF,
  /*  48 ..  55: 0 1 2 3 4 5 6 7 */
  0xFF,
  /*  56 ..  63: 8 9 : ; < = > accepted; '?' (63) is not */
  0x7F,
  /*  64 ..  71: @ A B C D E F G */
  0xFF,
  /*  72 ..  79: H I J K L M N O */
  0xFF,
  /*  80 ..  87: P Q R S T U V W */
  0xFF,
  /*  88 ..  95: X Y Z [ \ ] ^ _ */
  0xFF,
  /*  96 .. 103: ` a b c d e f g */
  0xFF,
  /* 104 .. 111: h i j k l m n o */
  0xFF,
  /* 112 .. 119: p q r s t u v w */
  0xFF,
  /* 120 .. 127: x y z { | } ~ accepted; del (127) is not */
  0x7F,
};

#undef T
75
/* Parser states. Only the s_req_* URL states (s_req_spaces_before_url through
 * s_req_fragment) and s_dead are used by the URL functions in this file; the
 * remaining enumerators are kept so that the numeric values stay unchanged.
 */
enum state {
  s_dead = 1, /* important that this is > 0 */

  /* response line */
  s_start_req_or_res,
  s_res_or_resp_H,
  s_start_res,
  s_res_H,
  s_res_HT,
  s_res_HTT,
  s_res_HTTP,
  s_res_http_major,
  s_res_http_dot,
  s_res_http_minor,
  s_res_http_end,
  s_res_first_status_code,
  s_res_status_code,
  s_res_status_start,
  s_res_status,
  s_res_line_almost_done,

  s_start_req,

  /* request line, including the URL states */
  s_req_method,
  s_req_spaces_before_url,
  s_req_schema,
  s_req_schema_slash,
  s_req_schema_slash_slash,
  s_req_server_start,
  s_req_server,
  s_req_server_with_at,
  s_req_path,
  s_req_query_string_start,
  s_req_query_string,
  s_req_fragment_start,
  s_req_fragment,
  s_req_http_start,
  s_req_http_H,
  s_req_http_HT,
  s_req_http_HTT,
  s_req_http_HTTP,
  s_req_http_I,
  s_req_http_IC,
  s_req_http_major,
  s_req_http_dot,
  s_req_http_minor,
  s_req_http_end,
  s_req_line_almost_done,

  /* header fields and values */
  s_header_field_start,
  s_header_field,
  s_header_value_discard_ws,
  s_header_value_discard_ws_almost_done,
  s_header_value_discard_lws,
  s_header_value_start,
  s_header_value,
  s_header_value_lws,

  s_header_almost_done,

  s_chunk_size_start,
  s_chunk_size,
  s_chunk_parameters,
  s_chunk_size_almost_done,

  s_headers_almost_done,
  s_headers_done,

  /* Important: 's_headers_done' must be the last 'header' state. All
   * states beyond this must be 'body' states. It is used for overflow
   * checking. See the PARSING_HEADER() macro (not defined in this file).
   */

  s_chunk_data,
  s_chunk_data_almost_done,
  s_chunk_data_done,

  s_body_identity,
  s_body_identity_eof,

  s_message_done
};
157
/* States of the authority ("userinfo@host:port") sub-parser implemented by
 * http_parse_host_char() and driven by http_parse_host().
 */
enum http_host_state {
  s_http_host_dead = 1,      /* input rejected */
  s_http_userinfo_start,
  s_http_userinfo,
  s_http_host_start,
  s_http_host_v6_start,      /* just after '[' */
  s_http_host,
  s_http_host_v6,
  s_http_host_v6_end,        /* just after ']' */
  s_http_host_v6_zone_start, /* just after '%' inside an IPv6 literal */
  s_http_host_v6_zone,
  s_http_host_port_start,    /* just after ':' */
  s_http_host_port
};
173
/* Macros for character classes; depends on strict-mode.
 *
 * Every macro argument is parenthesized so that compound expressions (for
 * example a conditional expression or `p[0] + 1`) are classified as a whole.
 * Most of these macros evaluate their argument more than once, so do not pass
 * expressions with side effects.
 */
#define CR                  '\r'
#define LF                  '\n'
/* ASCII lower-casing by setting bit 0x20; only meaningful for letters. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

/* STRICT_TOKEN and TOKEN index a `tokens` table that is not defined in the
 * visible part of this file; they only compile where such a table is in
 * scope.
 */
#define STRICT_TOKEN(c)     (((c) == ' ') ? 0 : tokens[(unsigned char)(c)])

#if HTTP_PARSER_STRICT
#define TOKEN(c)            STRICT_TOKEN(c)
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
#define TOKEN(c)            tokens[(unsigned char)(c)]
/* Lenient mode additionally accepts every byte with the high bit set. */
#define IS_URL_CHAR(c)                                                         \
  (BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
/* Lenient mode additionally accepts '_' in host names. */
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
202
203 /* Our URL parser.
204 *
205 * This is designed to be shared by http_parser_execute() for URL validation,
206 * hence it has a state transition + byte-for-byte interface. In addition, it
207 * is meant to be embedded in http_parser_parse_url(), which does the dirty
208 * work of turning state transitions URL components for its API.
209 *
210 * This function should only be invoked with non-space characters. It is
211 * assumed that the caller cares about (and can detect) the transition between
212 * URL and non-URL states by looking for these.
213 */
214 static enum state
parse_url_char(enum state s,const char ch)215 parse_url_char(enum state s, const char ch)
216 {
217 if (ch == ' ' || ch == '\r' || ch == '\n') {
218 return s_dead;
219 }
220
221 #if HTTP_PARSER_STRICT
222 if (ch == '\t' || ch == '\f') {
223 return s_dead;
224 }
225 #endif
226
227 switch (s) {
228 case s_req_spaces_before_url:
229 /* Proxied requests are followed by scheme of an absolute URI (alpha).
230 * All methods except CONNECT are followed by '/' or '*'.
231 */
232
233 if (ch == '/' || ch == '*') {
234 return s_req_path;
235 }
236
237 if (IS_ALPHA(ch)) {
238 return s_req_schema;
239 }
240
241 break;
242
243 case s_req_schema:
244 if (IS_ALPHA(ch)) {
245 return s;
246 }
247
248 if (ch == ':') {
249 return s_req_schema_slash;
250 }
251
252 break;
253
254 case s_req_schema_slash:
255 if (ch == '/') {
256 return s_req_schema_slash_slash;
257 }
258
259 break;
260
261 case s_req_schema_slash_slash:
262 if (ch == '/') {
263 return s_req_server_start;
264 }
265
266 break;
267
268 case s_req_server_with_at:
269 if (ch == '@') {
270 return s_dead;
271 }
272
273 /* fall through */
274 case s_req_server_start:
275 case s_req_server:
276 if (ch == '/') {
277 return s_req_path;
278 }
279
280 if (ch == '?') {
281 return s_req_query_string_start;
282 }
283
284 if (ch == '@') {
285 return s_req_server_with_at;
286 }
287
288 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
289 return s_req_server;
290 }
291
292 break;
293
294 case s_req_path:
295 if (IS_URL_CHAR(ch)) {
296 return s;
297 }
298
299 switch (ch) {
300 case '?':
301 return s_req_query_string_start;
302
303 case '#':
304 return s_req_fragment_start;
305 }
306
307 break;
308
309 case s_req_query_string_start:
310 case s_req_query_string:
311 if (IS_URL_CHAR(ch)) {
312 return s_req_query_string;
313 }
314
315 switch (ch) {
316 case '?':
317 /* allow extra '?' in query string */
318 return s_req_query_string;
319
320 case '#':
321 return s_req_fragment_start;
322 }
323
324 break;
325
326 case s_req_fragment_start:
327 if (IS_URL_CHAR(ch)) {
328 return s_req_fragment;
329 }
330
331 switch (ch) {
332 case '?':
333 return s_req_fragment;
334
335 case '#':
336 return s;
337 }
338
339 break;
340
341 case s_req_fragment:
342 if (IS_URL_CHAR(ch)) {
343 return s;
344 }
345
346 switch (ch) {
347 case '?':
348 case '#':
349 return s;
350 }
351
352 break;
353
354 default:
355 break;
356 }
357
358 /* We should never fall out of the switch above unless there's an error */
359 return s_dead;
360 }
361
362 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)363 http_parse_host_char(enum http_host_state s, const char ch) {
364 switch(s) {
365 case s_http_userinfo:
366 case s_http_userinfo_start:
367 if (ch == '@') {
368 return s_http_host_start;
369 }
370
371 if (IS_USERINFO_CHAR(ch)) {
372 return s_http_userinfo;
373 }
374 break;
375
376 case s_http_host_start:
377 if (ch == '[') {
378 return s_http_host_v6_start;
379 }
380
381 if (IS_HOST_CHAR(ch)) {
382 return s_http_host;
383 }
384
385 break;
386
387 case s_http_host:
388 if (IS_HOST_CHAR(ch)) {
389 return s_http_host;
390 }
391
392 /* fall through */
393 case s_http_host_v6_end:
394 if (ch == ':') {
395 return s_http_host_port_start;
396 }
397
398 break;
399
400 case s_http_host_v6:
401 if (ch == ']') {
402 return s_http_host_v6_end;
403 }
404
405 /* fall through */
406 case s_http_host_v6_start:
407 if (IS_HEX(ch) || ch == ':' || ch == '.') {
408 return s_http_host_v6;
409 }
410
411 if (s == s_http_host_v6 && ch == '%') {
412 return s_http_host_v6_zone_start;
413 }
414 break;
415
416 case s_http_host_v6_zone:
417 if (ch == ']') {
418 return s_http_host_v6_end;
419 }
420
421 /* fall through */
422 case s_http_host_v6_zone_start:
423 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
424 if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' ||
425 ch == '~') {
426 return s_http_host_v6_zone;
427 }
428 break;
429
430 case s_http_host_port:
431 case s_http_host_port_start:
432 if (IS_NUM(ch)) {
433 return s_http_host_port;
434 }
435
436 break;
437
438 default:
439 break;
440 }
441 return s_http_host_dead;
442 }
443
444 static int
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)445 http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
446 enum http_host_state s;
447
448 const char *p;
449 size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
450
451 assert(u->field_set & (1 << UF_HOST));
452
453 u->field_data[UF_HOST].len = 0;
454
455 s = found_at ? s_http_userinfo_start : s_http_host_start;
456
457 for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
458 enum http_host_state new_s = http_parse_host_char(s, *p);
459
460 if (new_s == s_http_host_dead) {
461 return 1;
462 }
463
464 switch(new_s) {
465 case s_http_host:
466 if (s != s_http_host) {
467 u->field_data[UF_HOST].off = (uint16_t)(p - buf);
468 }
469 u->field_data[UF_HOST].len++;
470 break;
471
472 case s_http_host_v6:
473 if (s != s_http_host_v6) {
474 u->field_data[UF_HOST].off = (uint16_t)(p - buf);
475 }
476 u->field_data[UF_HOST].len++;
477 break;
478
479 case s_http_host_v6_zone_start:
480 case s_http_host_v6_zone:
481 u->field_data[UF_HOST].len++;
482 break;
483
484 case s_http_host_port:
485 if (s != s_http_host_port) {
486 u->field_data[UF_PORT].off = (uint16_t)(p - buf);
487 u->field_data[UF_PORT].len = 0;
488 u->field_set |= (1 << UF_PORT);
489 }
490 u->field_data[UF_PORT].len++;
491 break;
492
493 case s_http_userinfo:
494 if (s != s_http_userinfo) {
495 u->field_data[UF_USERINFO].off = (uint16_t)(p - buf);
496 u->field_data[UF_USERINFO].len = 0;
497 u->field_set |= (1 << UF_USERINFO);
498 }
499 u->field_data[UF_USERINFO].len++;
500 break;
501
502 default:
503 break;
504 }
505 s = new_s;
506 }
507
508 /* Make sure we don't end somewhere unexpected */
509 switch (s) {
510 case s_http_host_start:
511 case s_http_host_v6_start:
512 case s_http_host_v6:
513 case s_http_host_v6_zone_start:
514 case s_http_host_v6_zone:
515 case s_http_host_port_start:
516 case s_http_userinfo:
517 case s_http_userinfo_start:
518 return 1;
519 default:
520 break;
521 }
522
523 return 0;
524 }
525
526 void
http_parser_url_init(struct http_parser_url * u)527 http_parser_url_init(struct http_parser_url *u) {
528 memset(u, 0, sizeof(*u));
529 }
530
531 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)532 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
533 struct http_parser_url *u)
534 {
535 enum state s;
536 const char *p;
537 enum http_parser_url_fields uf, old_uf;
538 int found_at = 0;
539
540 if (buflen == 0) {
541 return 1;
542 }
543
544 u->port = u->field_set = 0;
545 s = is_connect ? s_req_server_start : s_req_spaces_before_url;
546 old_uf = UF_MAX;
547
548 for (p = buf; p < buf + buflen; p++) {
549 s = parse_url_char(s, *p);
550
551 /* Figure out the next field that we're operating on */
552 switch (s) {
553 case s_dead:
554 return 1;
555
556 /* Skip delimeters */
557 case s_req_schema_slash:
558 case s_req_schema_slash_slash:
559 case s_req_server_start:
560 case s_req_query_string_start:
561 case s_req_fragment_start:
562 continue;
563
564 case s_req_schema:
565 uf = UF_SCHEMA;
566 break;
567
568 case s_req_server_with_at:
569 found_at = 1;
570
571 /* fall through */
572 case s_req_server:
573 uf = UF_HOST;
574 break;
575
576 case s_req_path:
577 uf = UF_PATH;
578 break;
579
580 case s_req_query_string:
581 uf = UF_QUERY;
582 break;
583
584 case s_req_fragment:
585 uf = UF_FRAGMENT;
586 break;
587
588 default:
589 assert(!"Unexpected state");
590 return 1;
591 }
592
593 /* Nothing's changed; soldier on */
594 if (uf == old_uf) {
595 u->field_data[uf].len++;
596 continue;
597 }
598
599 u->field_data[uf].off = (uint16_t)(p - buf);
600 u->field_data[uf].len = 1;
601
602 u->field_set |= (1 << uf);
603 old_uf = uf;
604 }
605
606 /* host must be present if there is a schema */
607 /* parsing http:///toto will fail */
608 if ((u->field_set & (1 << UF_SCHEMA)) &&
609 (u->field_set & (1 << UF_HOST)) == 0) {
610 return 1;
611 }
612
613 if (u->field_set & (1 << UF_HOST)) {
614 if (http_parse_host(buf, u, found_at) != 0) {
615 return 1;
616 }
617 }
618
619 /* CONNECT requests can only contain "hostname:port" */
620 if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
621 return 1;
622 }
623
624 if (u->field_set & (1 << UF_PORT)) {
625 uint16_t off;
626 uint16_t len;
627 const char* p;
628 const char* end;
629 unsigned long v;
630
631 off = u->field_data[UF_PORT].off;
632 len = u->field_data[UF_PORT].len;
633 end = buf + off + len;
634
635 /* NOTE: The characters are already validated and are in the [0-9] range */
636 assert(off + len <= buflen && "Port number overflow");
637 v = 0;
638 for (p = buf + off; p < end; p++) {
639 v *= 10;
640 v += *p - '0';
641
642 /* Ports have a max value of 2^16 */
643 if (v > 0xffff) {
644 return 1;
645 }
646 }
647
648 u->port = (uint16_t) v;
649 }
650
651 return 0;
652 }
653