1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Web crawler based on curl and libxml2.
9 * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
10 * License: MIT
11 *
12 * To compile:
13 * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
14 *
15 */
16 /* <DESC>
17 * Web crawler based on curl and libxml2 to stress-test curl with
18 * hundreds of concurrent connections to various servers.
19 * </DESC>
20 */
21
/* Parameters */
int max_con = 200;             /* cap on simultaneous connections (CURLMOPT_MAX_TOTAL_CONNECTIONS) */
int max_total = 20000;         /* stop queueing new links once complete + pending reaches this */
int max_requests = 500;        /* do not follow links while this many transfers are pending */
int max_link_per_page = 5;     /* links queued from each crawled HTML page */
int follow_relative_links = 0; /* non-zero: resolve relative hrefs against the page URL */
char *start_page = "https://www.reuters.com"; /* crawl seed URL */
29
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
38
/* Set from the SIGINT handler; polled by main's crawl loop to stop cleanly.
 * volatile sig_atomic_t is the only object type the C standard guarantees
 * can be safely written from a signal handler (C11 7.14.1.1). */
volatile sig_atomic_t pending_interrupt = 0;

/* SIGINT handler: just raise the flag; all real work stays in main. */
void sighandler(int dummy)
{
  (void) dummy;
  pending_interrupt = 1;
}
44
/* resizable buffer: accumulates one response body per easy handle */
typedef struct {
  char *buf;   /* heap buffer holding the body so far (not NUL-terminated) */
  size_t size; /* number of bytes currently stored in buf */
} memory;

/*
 * libcurl write callback (CURLOPT_WRITEFUNCTION): appends the incoming
 * chunk of sz*nmemb bytes to the memory buffer in ctx.
 * Returns the number of bytes consumed; returning 0 makes libcurl abort
 * the transfer, which is what we want on out-of-memory.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory *) ctx;
  /* realloc into a temporary so the original buffer stays valid (and
     freeable) if the allocation fails */
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* diagnostics go to stderr, not into the crawl output on stdout */
    fprintf(stderr, "not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
66
make_handle(char * url)67 CURL *make_handle(char *url)
68 {
69 CURL *handle = curl_easy_init();
70
71 /* Important: use HTTP2 over HTTPS */
72 curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
73 curl_easy_setopt(handle, CURLOPT_URL, url);
74
75 /* buffer body */
76 memory *mem = malloc(sizeof(memory));
77 mem->size = 0;
78 mem->buf = malloc(1);
79 curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
80 curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
81 curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
82
83 /* For completeness */
84 curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
85 curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
86 curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
87 curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
88 curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
89 curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
90 curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
91 curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
92 curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
93 curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
94 curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
95 curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
96 return handle;
97 }
98
99 /* HREF finder implemented in libxml2 but could be any HTML parser */
/* HREF finder implemented in libxml2 but could be any HTML parser */
/*
 * Parses the buffered page, picks random <a href> values and queues new
 * transfers for absolute http(s) links on multi_handle. Returns the number
 * of links queued.
 * Fixes vs the original: the parsed doc is now freed on every path
 * (xmlFreeDoc was never called), href strings no longer leak on the
 * short-link and break paths, and the random index can no longer reach
 * nodeNr (r * nodeNr / RAND_MAX == nodeNr when rand() == RAND_MAX,
 * an out-of-bounds read of nodeTab).
 */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
  if(!doc)
    return 0;
  xmlChar *xpath = (xmlChar *) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  xmlNodeSetPtr nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  size_t count = 0;
  for(int i = 0; i < nodeset->nodeNr; i++) {
    /* sample hrefs at random (with replacement) to spread the crawl;
       modulo keeps x strictly inside [0, nodeNr) */
    int x = rand() % nodeset->nodeNr;
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    char *link = (char *) href;
    if(!link)
      continue;
    if(strlen(link) < 20) {
      xmlFree(link);
      continue;
    }
    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
      /* CURLOPT_URL copies the string, so link can be freed below */
      curl_multi_add_handle(multi_handle, make_handle(link));
      if(count++ == max_link_per_page) {
        xmlFree(link);
        break;
      }
    }
    xmlFree(link);
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}
142
/*
 * Returns non-zero when the Content-Type header value names an HTML body.
 * Fix: the original also required strlen(ctype) > 10, which rejected a
 * bare "text/html" header (9 characters) even though it is valid; the
 * substring check alone is sufficient.
 */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
147
/*
 * Entry point: seeds the multi handle with start_page, then drives all
 * transfers with curl_multi_wait()/curl_multi_perform() until nothing is
 * running or SIGINT sets pending_interrupt. Completed transfers are
 * inspected; 200-status HTML bodies are fed to follow_links() to queue
 * more work, subject to the max_requests/max_total caps.
 */
int main(void)
{
  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION;
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURLM *multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enables http/2 if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* sets html start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  int msgs_left;
  int pending = 0;   /* transfers queued by follow_links, not yet completed.
                        NOTE(review): the seed transfer above is never counted,
                        so pending can go to -1 after it completes — harmless
                        for the caps below, but worth confirming. */
  int complete = 0;  /* total finished transfers, used in log prefixes */
  int still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    /* wait up to 1s for socket activity, then let libcurl advance */
    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    CURLMsg *m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        memory *mem;
        /* mem was attached by make_handle via CURLOPT_PRIVATE */
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
            /* only parse real HTML with a non-trivial body */
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += follow_links(multi_handle, mem, url);
                /* new handles were added; force another loop iteration even
                   if perform() reported nothing running just now */
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        /* remove before cleanup, then release the body buffer we own */
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
217