• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright Joyent, Inc. and other Node contributors.
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to
5  * deal in the Software without restriction, including without limitation the
6  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7  * sell copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19  * IN THE SOFTWARE.
20  */
21 #include "url_parser.h"
22 #include <assert.h>
23 #include <stddef.h>
24 #include <ctype.h>
25 #include <string.h>
26 #include <limits.h>
27 
#ifndef BIT_AT
/* Evaluate to 1 when bit number `i` of the byte-packed bitmap `a` is set and
 * to 0 otherwise. Bit `i` lives in byte `i >> 3` at position `i & 7`, counted
 * from the least significant bit. `i` is expanded twice, so do not pass an
 * expression with side effects.
 */
# define BIT_AT(a, i)                                                \
  ((((unsigned int) (a)[(unsigned int) (i) >> 3] >>                  \
     ((unsigned int) (i) & 7)) & 1u) != 0)
#endif
33 
/* T(v) marks table bits that are only accepted in lenient builds: it expands
 * to 0 when HTTP_PARSER_STRICT is non-zero and to v otherwise. It is a local
 * helper for the table below and is #undef'd right after it.
 */
#if HTTP_PARSER_STRICT
# define T(v) 0
#else
# define T(v) v
#endif

/* Bitmap of byte values that IS_URL_CHAR() accepts, read through BIT_AT():
 * character code c maps to bit (c & 7) of byte (c >> 3). Each row below
 * initializes one byte and therefore covers eight consecutive codes; the
 * comment above a row names them in bit order (lowest bit first).
 *
 * Cleared bits: the control characters, space, '#', '?' and DEL. Horizontal
 * tab (9) and form feed (12) are set only when T() passes its argument
 * through, i.e. in non-strict builds.
 *
 * Only 16 of the 32 bytes have initializers, so the bits for codes 128-255
 * are implicitly zero.
 */
static const uint8_t normal_url_char[32] = {
/*   0 nul    1 soh    2 stx    3 etx    4 eot    5 enq    6 ack    7 bel  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*   8 bs     9 ht    10 nl    11 vt    12 np    13 cr    14 so    15 si   */
        0    | T(2)   |   0    |   0    | T(16)  |   0    |   0    |   0,
/*  16 dle   17 dc1   18 dc2   19 dc3   20 dc4   21 nak   22 syn   23 etb */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  24 can   25 em    26 sub   27 esc   28 fs    29 gs    30 rs    31 us  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  32 sp    33  !    34  "    35  #    36  $    37  %    38  &    39  '  */
        0    |   2    |   4    |   0    |   16   |   32   |   64   |  128,
/*  40  (    41  )    42  *    43  +    44  ,    45  -    46  .    47  /  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  48  0    49  1    50  2    51  3    52  4    53  5    54  6    55  7  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  56  8    57  9    58  :    59  ;    60  <    61  =    62  >    63  ?  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0,
/*  64  @    65  A    66  B    67  C    68  D    69  E    70  F    71  G  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  72  H    73  I    74  J    75  K    76  L    77  M    78  N    79  O  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  80  P    81  Q    82  R    83  S    84  T    85  U    86  V    87  W  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  88  X    89  Y    90  Z    91  [    92  \    93  ]    94  ^    95  _  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  96  `    97  a    98  b    99  c   100  d   101  e   102  f   103  g  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 104  h   105  i   106  j   107  k   108  l   109  m   110  n   111  o  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 112  p   113  q   114  r   115  s   116  t   117  u   118  v   119  w  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 120  x   121  y   122  z   123  {   124  |   125  }   126  ~   127 del */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0, };

#undef T
75 
/* Parser states. The code visible in this file only drives the URL subset
 * (s_dead and s_req_spaces_before_url .. s_req_fragment); the remaining
 * enumerators are kept so the numeric values stay unchanged.
 */
enum state {
  s_dead = 1, /* important that this is > 0 */

  /* response line */
  s_start_req_or_res,
  s_res_or_resp_H,
  s_start_res,
  s_res_H,
  s_res_HT,
  s_res_HTT,
  s_res_HTTP,
  s_res_http_major,
  s_res_http_dot,
  s_res_http_minor,
  s_res_http_end,
  s_res_first_status_code,
  s_res_status_code,
  s_res_status_start,
  s_res_status,
  s_res_line_almost_done,

  /* request line */
  s_start_req,

  s_req_method,
  s_req_spaces_before_url,
  s_req_schema,
  s_req_schema_slash,
  s_req_schema_slash_slash,
  s_req_server_start,
  s_req_server,
  s_req_server_with_at,
  s_req_path,
  s_req_query_string_start,
  s_req_query_string,
  s_req_fragment_start,
  s_req_fragment,
  s_req_http_start,
  s_req_http_H,
  s_req_http_HT,
  s_req_http_HTT,
  s_req_http_HTTP,
  s_req_http_I,
  s_req_http_IC,
  s_req_http_major,
  s_req_http_dot,
  s_req_http_minor,
  s_req_http_end,
  s_req_line_almost_done,

  /* header fields and values */
  s_header_field_start,
  s_header_field,
  s_header_value_discard_ws,
  s_header_value_discard_ws_almost_done,
  s_header_value_discard_lws,
  s_header_value_start,
  s_header_value,
  s_header_value_lws,

  s_header_almost_done,

  /* chunk size line */
  s_chunk_size_start,
  s_chunk_size,
  s_chunk_parameters,
  s_chunk_size_almost_done,

  s_headers_almost_done,
  s_headers_done,

  /* Important: 's_headers_done' must be the last 'header' state. All
   * states beyond this must be 'body' states. It is used for overflow
   * checking. See the PARSING_HEADER() macro (not defined in this file).
   */

  s_chunk_data,
  s_chunk_data_almost_done,
  s_chunk_data_done,

  s_body_identity,
  s_body_identity_eof,

  s_message_done
};
157 
/* States of the authority ("userinfo@host:port") sub-parser implemented by
 * http_parse_host_char() and http_parse_host().
 */
enum http_host_state {
  s_http_host_dead = 1,      /* rejected input */
  s_http_userinfo_start,
  s_http_userinfo,
  s_http_host_start,
  s_http_host_v6_start,      /* just consumed '[' */
  s_http_host,
  s_http_host_v6,
  s_http_host_v6_end,        /* just consumed ']' */
  s_http_host_v6_zone_start, /* just consumed '%' inside brackets */
  s_http_host_v6_zone,
  s_http_host_port_start,    /* just consumed ':' */
  s_http_host_port
};
173 
/* Character-class macros. TOKEN, IS_URL_CHAR and IS_HOST_CHAR have a strict
 * and a lenient definition, selected by HTTP_PARSER_STRICT.
 *
 * Every macro parenthesizes its argument, so any expression (for example a
 * conditional expression) can be passed. Most of them still expand the
 * argument more than once, so do not pass expressions with side effects.
 *
 * STRICT_TOKEN and TOKEN index an array named `tokens`, which is not defined
 * in the code visible here; they only expand correctly where such a table is
 * in scope.
 */
#define CR                  '\r'
#define LF                  '\n'
/* Sets bit 0x20, which maps 'A'-'Z' onto 'a'-'z'. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

#define STRICT_TOKEN(c)     (((c) == ' ') ? 0 : tokens[(unsigned char)(c)])

#if HTTP_PARSER_STRICT
#define TOKEN(c)            STRICT_TOKEN(c)
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
#define TOKEN(c)            tokens[(unsigned char)(c)]
/* Lenient mode additionally accepts every byte with the high bit set. */
#define IS_URL_CHAR(c)                                                         \
  (BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
/* Lenient mode additionally accepts '_' in host names. */
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
202 
203 /* Our URL parser.
204  *
205  * This is designed to be shared by http_parser_execute() for URL validation,
206  * hence it has a state transition + byte-for-byte interface. In addition, it
207  * is meant to be embedded in http_parser_parse_url(), which does the dirty
208  * work of turning state transitions URL components for its API.
209  *
210  * This function should only be invoked with non-space characters. It is
211  * assumed that the caller cares about (and can detect) the transition between
212  * URL and non-URL states by looking for these.
213  */
214 static enum state
parse_url_char(enum state s,const char ch)215 parse_url_char(enum state s, const char ch)
216 {
217   if (ch == ' ' || ch == '\r' || ch == '\n') {
218     return s_dead;
219   }
220 
221 #if HTTP_PARSER_STRICT
222   if (ch == '\t' || ch == '\f') {
223     return s_dead;
224   }
225 #endif
226 
227   switch (s) {
228     case s_req_spaces_before_url:
229       /* Proxied requests are followed by scheme of an absolute URI (alpha).
230        * All methods except CONNECT are followed by '/' or '*'.
231        */
232 
233       if (ch == '/' || ch == '*') {
234         return s_req_path;
235       }
236 
237       if (IS_ALPHA(ch)) {
238         return s_req_schema;
239       }
240 
241       break;
242 
243     case s_req_schema:
244       if (IS_ALPHA(ch)) {
245         return s;
246       }
247 
248       if (ch == ':') {
249         return s_req_schema_slash;
250       }
251 
252       break;
253 
254     case s_req_schema_slash:
255       if (ch == '/') {
256         return s_req_schema_slash_slash;
257       }
258 
259       break;
260 
261     case s_req_schema_slash_slash:
262       if (ch == '/') {
263         return s_req_server_start;
264       }
265 
266       break;
267 
268     case s_req_server_with_at:
269       if (ch == '@') {
270         return s_dead;
271       }
272 
273     /* fall through */
274     case s_req_server_start:
275     case s_req_server:
276       if (ch == '/') {
277         return s_req_path;
278       }
279 
280       if (ch == '?') {
281         return s_req_query_string_start;
282       }
283 
284       if (ch == '@') {
285         return s_req_server_with_at;
286       }
287 
288       if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
289         return s_req_server;
290       }
291 
292       break;
293 
294     case s_req_path:
295       if (IS_URL_CHAR(ch)) {
296         return s;
297       }
298 
299       switch (ch) {
300         case '?':
301           return s_req_query_string_start;
302 
303         case '#':
304           return s_req_fragment_start;
305       }
306 
307       break;
308 
309     case s_req_query_string_start:
310     case s_req_query_string:
311       if (IS_URL_CHAR(ch)) {
312         return s_req_query_string;
313       }
314 
315       switch (ch) {
316         case '?':
317           /* allow extra '?' in query string */
318           return s_req_query_string;
319 
320         case '#':
321           return s_req_fragment_start;
322       }
323 
324       break;
325 
326     case s_req_fragment_start:
327       if (IS_URL_CHAR(ch)) {
328         return s_req_fragment;
329       }
330 
331       switch (ch) {
332         case '?':
333           return s_req_fragment;
334 
335         case '#':
336           return s;
337       }
338 
339       break;
340 
341     case s_req_fragment:
342       if (IS_URL_CHAR(ch)) {
343         return s;
344       }
345 
346       switch (ch) {
347         case '?':
348         case '#':
349           return s;
350       }
351 
352       break;
353 
354     default:
355       break;
356   }
357 
358   /* We should never fall out of the switch above unless there's an error */
359   return s_dead;
360 }
361 
362 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)363 http_parse_host_char(enum http_host_state s, const char ch) {
364   switch(s) {
365     case s_http_userinfo:
366     case s_http_userinfo_start:
367       if (ch == '@') {
368         return s_http_host_start;
369       }
370 
371       if (IS_USERINFO_CHAR(ch)) {
372         return s_http_userinfo;
373       }
374       break;
375 
376     case s_http_host_start:
377       if (ch == '[') {
378         return s_http_host_v6_start;
379       }
380 
381       if (IS_HOST_CHAR(ch)) {
382         return s_http_host;
383       }
384 
385       break;
386 
387     case s_http_host:
388       if (IS_HOST_CHAR(ch)) {
389         return s_http_host;
390       }
391 
392     /* fall through */
393     case s_http_host_v6_end:
394       if (ch == ':') {
395         return s_http_host_port_start;
396       }
397 
398       break;
399 
400     case s_http_host_v6:
401       if (ch == ']') {
402         return s_http_host_v6_end;
403       }
404 
405     /* fall through */
406     case s_http_host_v6_start:
407       if (IS_HEX(ch) || ch == ':' || ch == '.') {
408         return s_http_host_v6;
409       }
410 
411       if (s == s_http_host_v6 && ch == '%') {
412         return s_http_host_v6_zone_start;
413       }
414       break;
415 
416     case s_http_host_v6_zone:
417       if (ch == ']') {
418         return s_http_host_v6_end;
419       }
420 
421     /* fall through */
422     case s_http_host_v6_zone_start:
423       /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
424       if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' ||
425           ch == '~') {
426         return s_http_host_v6_zone;
427       }
428       break;
429 
430     case s_http_host_port:
431     case s_http_host_port_start:
432       if (IS_NUM(ch)) {
433         return s_http_host_port;
434       }
435 
436       break;
437 
438     default:
439       break;
440   }
441   return s_http_host_dead;
442 }
443 
444 static int
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)445 http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
446   enum http_host_state s;
447 
448   const char *p;
449   size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
450 
451   assert(u->field_set & (1 << UF_HOST));
452 
453   u->field_data[UF_HOST].len = 0;
454 
455   s = found_at ? s_http_userinfo_start : s_http_host_start;
456 
457   for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
458     enum http_host_state new_s = http_parse_host_char(s, *p);
459 
460     if (new_s == s_http_host_dead) {
461       return 1;
462     }
463 
464     switch(new_s) {
465       case s_http_host:
466         if (s != s_http_host) {
467           u->field_data[UF_HOST].off = (uint16_t)(p - buf);
468         }
469         u->field_data[UF_HOST].len++;
470         break;
471 
472       case s_http_host_v6:
473         if (s != s_http_host_v6) {
474           u->field_data[UF_HOST].off = (uint16_t)(p - buf);
475         }
476         u->field_data[UF_HOST].len++;
477         break;
478 
479       case s_http_host_v6_zone_start:
480       case s_http_host_v6_zone:
481         u->field_data[UF_HOST].len++;
482         break;
483 
484       case s_http_host_port:
485         if (s != s_http_host_port) {
486           u->field_data[UF_PORT].off = (uint16_t)(p - buf);
487           u->field_data[UF_PORT].len = 0;
488           u->field_set |= (1 << UF_PORT);
489         }
490         u->field_data[UF_PORT].len++;
491         break;
492 
493       case s_http_userinfo:
494         if (s != s_http_userinfo) {
495           u->field_data[UF_USERINFO].off = (uint16_t)(p - buf);
496           u->field_data[UF_USERINFO].len = 0;
497           u->field_set |= (1 << UF_USERINFO);
498         }
499         u->field_data[UF_USERINFO].len++;
500         break;
501 
502       default:
503         break;
504     }
505     s = new_s;
506   }
507 
508   /* Make sure we don't end somewhere unexpected */
509   switch (s) {
510     case s_http_host_start:
511     case s_http_host_v6_start:
512     case s_http_host_v6:
513     case s_http_host_v6_zone_start:
514     case s_http_host_v6_zone:
515     case s_http_host_port_start:
516     case s_http_userinfo:
517     case s_http_userinfo_start:
518       return 1;
519     default:
520       break;
521   }
522 
523   return 0;
524 }
525 
526 void
http_parser_url_init(struct http_parser_url * u)527 http_parser_url_init(struct http_parser_url *u) {
528   memset(u, 0, sizeof(*u));
529 }
530 
531 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)532 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
533                       struct http_parser_url *u)
534 {
535   enum state s;
536   const char *p;
537   enum http_parser_url_fields uf, old_uf;
538   int found_at = 0;
539 
540   if (buflen == 0) {
541     return 1;
542   }
543 
544   u->port = u->field_set = 0;
545   s = is_connect ? s_req_server_start : s_req_spaces_before_url;
546   old_uf = UF_MAX;
547 
548   for (p = buf; p < buf + buflen; p++) {
549     s = parse_url_char(s, *p);
550 
551     /* Figure out the next field that we're operating on */
552     switch (s) {
553       case s_dead:
554         return 1;
555 
556       /* Skip delimeters */
557       case s_req_schema_slash:
558       case s_req_schema_slash_slash:
559       case s_req_server_start:
560       case s_req_query_string_start:
561       case s_req_fragment_start:
562         continue;
563 
564       case s_req_schema:
565         uf = UF_SCHEMA;
566         break;
567 
568       case s_req_server_with_at:
569         found_at = 1;
570 
571       /* fall through */
572       case s_req_server:
573         uf = UF_HOST;
574         break;
575 
576       case s_req_path:
577         uf = UF_PATH;
578         break;
579 
580       case s_req_query_string:
581         uf = UF_QUERY;
582         break;
583 
584       case s_req_fragment:
585         uf = UF_FRAGMENT;
586         break;
587 
588       default:
589         assert(!"Unexpected state");
590         return 1;
591     }
592 
593     /* Nothing's changed; soldier on */
594     if (uf == old_uf) {
595       u->field_data[uf].len++;
596       continue;
597     }
598 
599     u->field_data[uf].off = (uint16_t)(p - buf);
600     u->field_data[uf].len = 1;
601 
602     u->field_set |= (1 << uf);
603     old_uf = uf;
604   }
605 
606   /* host must be present if there is a schema */
607   /* parsing http:///toto will fail */
608   if ((u->field_set & (1 << UF_SCHEMA)) &&
609       (u->field_set & (1 << UF_HOST)) == 0) {
610     return 1;
611   }
612 
613   if (u->field_set & (1 << UF_HOST)) {
614     if (http_parse_host(buf, u, found_at) != 0) {
615       return 1;
616     }
617   }
618 
619   /* CONNECT requests can only contain "hostname:port" */
620   if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
621     return 1;
622   }
623 
624   if (u->field_set & (1 << UF_PORT)) {
625     uint16_t off;
626     uint16_t len;
627     const char* p;
628     const char* end;
629     unsigned long v;
630 
631     off = u->field_data[UF_PORT].off;
632     len = u->field_data[UF_PORT].len;
633     end = buf + off + len;
634 
635     /* NOTE: The characters are already validated and are in the [0-9] range */
636     assert(off + len <= buflen && "Port number overflow");
637     v = 0;
638     for (p = buf + off; p < end; p++) {
639       v *= 10;
640       v += *p - '0';
641 
642       /* Ports have a max value of 2^16 */
643       if (v > 0xffff) {
644         return 1;
645       }
646     }
647 
648     u->port = (uint16_t) v;
649   }
650 
651   return 0;
652 }
653