1 /*
2 * Copyright 2014-2022 The GmSSL Project. All Rights Reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the License); you may
5 * not use this file except in compliance with the License.
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 */
9
10
11 #include "url_parser.h"
12 #include <errno.h>
13 #include <stdlib.h>
14 #include <string.h>
15
/*
 * Locate the first occurrence of the NUL-terminated string `needle` inside
 * the first `s_len` bytes of `s`.
 *
 * s       start of the region to search; need not be NUL-terminated
 *         within `s_len` bytes
 * s_len   number of bytes of `s` that may be examined
 * needle  NUL-terminated string to look for
 *
 * Returns a pointer to the first byte of the match, or NULL when `needle`
 * does not occur in the region. An empty `needle` matches at `s`.
 */
static const char *_strnstr(const char *s, size_t s_len, const char *needle)
{
	size_t needle_len = strlen(needle);
	size_t last_start;
	size_t i;

	/*
	 * A needle longer than the region can never match. Rejecting it
	 * up front also keeps `s_len - needle_len` from wrapping around and
	 * avoids ever forming a pointer that lies before `s`.
	 */
	if (needle_len > s_len) {
		return NULL;
	}

	/* Highest offset at which a full-length match still fits. */
	last_start = s_len - needle_len;

	for (i = 0; i <= last_start; i++) {
		if (strncmp(s + i, needle, needle_len) == 0) {
			return s + i;
		}
	}

	return NULL;
}
32
/*
 * Scan the first `s_len` bytes of `s` from the front and return a pointer
 * to the first byte that equals any character of the NUL-terminated set
 * `chars`. Returns NULL when no byte of the region is in the set (which is
 * always the case for an empty set or an empty region).
 */
static const char *find_chars(const char *s, size_t s_len, const char *chars)
{
	const size_t set_len = strlen(chars);
	size_t pos;

	for (pos = 0; pos < s_len; pos++) {
		/* Only the set's characters are searched, never its terminator. */
		if (memchr(chars, s[pos], set_len) != NULL) {
			return &s[pos];
		}
	}

	return NULL;
}
52
/*
 * Scan the first `s_len` bytes of `s` from the back and return a pointer to
 * the last byte that equals any character of the NUL-terminated set `chars`.
 *
 * s      start of the region to search
 * s_len  number of bytes of `s` that may be examined
 * chars  NUL-terminated set of characters to look for
 *
 * Returns NULL when no byte of the region is in the set, including when the
 * region or the set is empty.
 */
static const char *find_chars_reverse(const char *s, size_t s_len, const char *chars)
{
	size_t chars_n = strlen(chars);
	size_t remaining;

	/*
	 * Count down with an offset instead of a pointer: the loop never
	 * forms `s - 1`, which would be undefined behaviour even if it is
	 * not dereferenced (an empty region used to start there).
	 */
	for (remaining = s_len; remaining > 0; remaining--) {
		if (memchr(chars, s[remaining - 1], chars_n) != NULL) {
			return s + (remaining - 1);
		}
	}

	return NULL;
}
72
/*
 * Return 1 when `c` is an ASCII letter ('a'-'z' or 'A'-'Z'), 0 otherwise.
 */
static int is_alpha(char c)
{
	const int is_lower = (c >= 'a' && c <= 'z');
	const int is_upper = (c >= 'A' && c <= 'Z');

	return is_lower || is_upper;
}
81
/*
 * Return 1 when `c` is an ASCII decimal digit ('0'-'9'), 0 otherwise.
 */
static int is_digit(char c)
{
	/* Outside the range on either side means "not a digit". */
	return !(c < '0' || c > '9');
}
89
/*
 * Return 1 when `c` is an ASCII control character (0x00-0x1f, or 0x7f),
 * 0 otherwise. Bytes with the high bit set are never reported as control
 * characters.
 */
static int is_control(char c)
{
	if (c == 0x7f) {
		return 1;
	}

	/*
	 * Viewing the byte as unsigned gives a single upper-bound test that
	 * yields the same answer whether plain char is signed or unsigned.
	 */
	return (unsigned char) c <= 0x1f;
}
98
/*
 * Check whether the NUL-terminated string `s` begins with a URL scheme:
 * one letter followed by any run of letters, digits, '+', '-' or '.',
 * terminated by ':'.
 *
 * Returns a pointer to the terminating ':' when such a scheme is present,
 * or NULL when the string is empty, does not start with a letter, contains
 * any other character before the first ':', or has no ':' at all.
 */
static const char *lookup_scheme(const char *s)
{
	const char *cur;

	/* The empty string is rejected here too: NUL is not a letter. */
	if (!is_alpha(s[0])) {
		return NULL;
	}

	for (cur = s + 1; *cur != '\0'; cur++) {
		const char ch = *cur;

		if (ch == ':') {
			return cur;
		}
		if (is_alpha(ch) || is_digit(ch)) {
			continue;
		}
		if (ch == '+' || ch == '-' || ch == '.') {
			continue;
		}
		/* Anything else cannot be part of a scheme. */
		return NULL;
	}

	/* Reached the end of the string without finding ':'. */
	return NULL;
}
129
/*
 * Split the userinfo region `s[0 .. s_len)` at its first ':' and store
 * heap-allocated copies in `c`.
 *
 * With a ':' present, the text before it is stored in c->user and the text
 * after it in c->password (either may be the empty string). Without a ':',
 * the whole region is stored in c->user and c->password is left untouched.
 *
 * Returns 0 on success, -1 when an allocation fails. On failure nothing
 * allocated by this call is left behind: c->user is released and reset to
 * NULL if the password copy could not be made.
 */
static int parse_user_password(const char *s, size_t s_len, URL_COMPONENTS *c)
{
	const char *end = s + s_len;
	const char *colon;
	int saved_errno;

	colon = _strnstr(s, s_len, ":");
	if (colon == NULL) {
		/* user only */
		c->user = strndup(s, s_len);
		if (c->user == NULL) {
			return -1; /* ENOMEM */
		}
		return 0;
	}

	c->user = strndup(s, (size_t) (colon - s));
	if (c->user == NULL) {
		return -1; /* ENOMEM */
	}

	c->password = strndup(colon + 1, (size_t) (end - colon - 1));
	if (c->password == NULL) {
		/*
		 * Do not leave a half-filled result for the caller to clean
		 * up; keep the allocator's errno across free().
		 */
		saved_errno = errno;
		free(c->user);
		c->user = NULL;
		errno = saved_errno;
		return -1; /* ENOMEM */
	}

	return 0;
}
154
/*
 * Parse the authority region `s[0 .. s_len)` of a URL, i.e. the
 * "[user[:password]@]host[:port]" part, into `c`.
 *
 * On success:
 *   - c->user / c->password are set when an '@' is present (split by
 *     parse_user_password());
 *   - c->host is a heap-allocated copy of the host; a bracketed IP-literal
 *     is stored including its '[' and ']';
 *   - c->port is the decimal port (0..65535), or -1 when no port or an
 *     empty port ("host:") is given.
 * An empty region is accepted and leaves only c->port = -1.
 *
 * Returns 0 on success and -1 on failure. errno is set to EINVAL for a
 * malformed authority (empty host, stray or unbalanced brackets, a space in
 * the host, a non-numeric or out-of-range port); on allocation failure
 * errno is whatever the allocator left. On failure, any user/password
 * stored by this call is released and reset to NULL.
 */
static int parse_authority(const char *s, size_t s_len, URL_COMPONENTS *c)
{
	const char *end = s + s_len;
	const char *p, *at, *host_start, *host_end;
	int port = -1;
	int saved_errno;

	c->port = -1;

	if (s_len == 0) { /* empty authority */
		return 0;
	}

	at = _strnstr(s, s_len, "@");
	if (at) {
		if (parse_user_password(s, (size_t) (at - s), c) == -1) {
			goto fail;
		}
		host_start = at + 1;
	} else {
		host_start = s;
	}

	/* Only look at *host_start while it is still inside the region. */
	if (host_start < end && *host_start == '[') {
		/* IP-literal host */
		size_t inner_len = (size_t) (end - host_start - 1);

		if (find_chars(host_start + 1, inner_len, "[")) {
			errno = EINVAL;
			goto fail;
		}
		host_end = find_chars(host_start + 1, inner_len, "]");
		if (!host_end) {
			errno = EINVAL;
			goto fail;
		}
		/* The character after ']' must be the end or ':'. */
		if (host_end + 1 != end && host_end[1] != ':') {
			errno = EINVAL;
			goto fail;
		}
		host_end++; /* keep the closing bracket in the host */
	} else {
		/* IPv4address / reg-name host: the last ':' starts the port */
		host_end = find_chars_reverse(host_start,
			(size_t) (end - host_start), ":");
		if (host_end == NULL) {
			host_end = end;
		}
		if (find_chars(host_start, (size_t) (host_end - host_start), "[]")) {
			errno = EINVAL;
			goto fail;
		}
	}
	if (find_chars(host_start, (size_t) (host_end - host_start), " ")) {
		errno = EINVAL;
		goto fail;
	}

	/* ASSERT: host_end == end or *host_end == ':' */

	if (host_start == host_end) { /* empty host, with or without port */
		errno = EINVAL;
		goto fail;
	}

	if (host_end != end && host_end + 1 < end) {
		/* non-empty port: decimal digits only, at most 65535 */
		port = 0;
		for (p = host_end + 1; p < end; p++) {
			if (*p < '0' || *p > '9') {
				errno = EINVAL;
				goto fail;
			}
			/* Checked every digit, so `port` cannot overflow. */
			port = port * 10 + (*p - '0');
			if (port > 65535) {
				errno = EINVAL;
				goto fail;
			}
		}
	}

	c->host = strndup(host_start, (size_t) (host_end - host_start));
	if (c->host == NULL) {
		goto fail; /* ENOMEM */
	}
	c->port = port;

	return 0;

fail:
	if (at) {
		/*
		 * Release the userinfo stored above so a failed parse leaves
		 * nothing behind; keep the reported errno across free().
		 */
		saved_errno = errno;
		free(c->user);
		free(c->password);
		c->user = NULL;
		c->password = NULL;
		errno = saved_errno;
	}
	return -1;
}
265
parse_url(const char * url)266 URL_COMPONENTS *parse_url(const char *url)
267 {
268 URL_COMPONENTS *c;
269 const char *p;
270 const char *end = url + strlen(url);
271 const char *found;
272 size_t len;
273
274 for (p = url ; p < end ; p++) {
275 if (is_control(*p)) {
276 errno = EINVAL;
277 return NULL;
278 }
279 }
280
281 c = malloc(sizeof(URL_COMPONENTS));
282 if (!c) {
283 return NULL;
284 }
285 memset(c, 0, sizeof(URL_COMPONENTS));
286 c->port = -1;
287
288 p = url;
289
290 /* lookup scheme */
291 found = lookup_scheme(p);
292 if (found) {
293 c->scheme = strndup(url, (size_t) (found - p));
294 if (c->scheme == NULL) {
295 goto error;
296 }
297 p = found + 1; /* skip a colon */
298 if (p >= end) {
299 return c;
300 }
301 }
302
303 if (strlen(p) >= 2 &&
304 p[0] == '/' && p[1] == '/') {
305 /* authority */
306 p = p + 2;
307 found = find_chars(p, strlen(p), "/?#");
308 if (found == NULL) {
309 len = strlen(p);
310 } else {
311 len = (size_t) (found - p);
312 }
313 if (parse_authority(p, len, c) == -1) {
314 goto error; /* ENOMEM,EINVAL */
315 }
316
317 if (!found) {
318 return c;
319 }
320
321 p = found;
322 }
323
324 if (*p != '?' && *p != '#') {
325 /* path */
326 found = find_chars(p, strlen(p), "?#");
327 found = NULL;
328 if (found == NULL) {
329 c->path = strdup(p);
330 if (c->path == NULL) {
331 goto error;
332 }
333 } else
334 {
335 if (found != p) {
336 c->path = strndup(p, (size_t) (found - p));
337 if (c->path == NULL) {
338 goto error;
339 }
340 }
341 }
342
343 if (!found) {
344 return c;
345 }
346
347 p = found;
348 }
349
350 /* ASSERT: *p is '?' or '#' */
351 #if 0
352 if (*p == '?') {
353 /* query */
354 p = p + 1;
355 found = find_chars(p, strlen(p), "#");
356 if (found == NULL) {
357 c->query = strdup(p);
358 } else {
359 c->query = strndup(p, (size_t) (found - p));
360 }
361
362 if (c->query == NULL) {
363 goto error;
364 }
365
366 if (!found) {
367 return c;
368 }
369
370 p = found;
371 }
372 #endif
373
374 /* ASSERT: *p is '#' */
375
376 /* fragment */
377 p = p + 1;
378 c->fragment = strdup(p);
379 if (c->fragment == NULL) {
380 goto error;
381 }
382
383 return c;
384
385 error:
386 free(c);
387
388 return NULL;
389 }
390
/*
 * Release a URL_COMPONENTS structure together with every component string
 * it holds (scheme, user, password, host, path, query, fragment).
 *
 * Passing NULL is allowed and does nothing, mirroring free(). Components
 * that are NULL are skipped by free() itself.
 */
void free_url_components(URL_COMPONENTS *c)
{
	if (c == NULL) {
		return;
	}

	free(c->scheme);
	free(c->user);
	free(c->password);
	free(c->host);
	free(c->path);
	free(c->query);
	free(c->fragment);
	free(c);
}
416
417