• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Web crawler based on curl and libxml2.
 * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
 * License: MIT
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */
21 
/* Parameters */
int max_con = 200;              /* cap on simultaneous connections
                                   (CURLMOPT_MAX_TOTAL_CONNECTIONS) */
int max_total = 20000;          /* stop queuing once complete + pending
                                   transfers reach this count */
int max_requests = 500;         /* cap on transfers pending at once */
int max_link_per_page = 5;      /* links queued per parsed HTML page */
int follow_relative_links = 0;  /* if nonzero, resolve relative hrefs
                                   against the page URL via xmlBuildURI */
char *start_page = "https://www.reuters.com";  /* crawl seed URL */
29 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

#include <curl/curl.h>

#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>
38 
/* Set from the SIGINT handler. volatile sig_atomic_t is the only object
 * type the C standard guarantees can be safely written from a signal
 * handler (a plain int was not safe here). */
volatile sig_atomic_t pending_interrupt = 0;

/* SIGINT handler: request a graceful shutdown of the crawl loop. */
void sighandler(int dummy)
{
  (void) dummy;  /* required by the signal-handler signature, unused */
  pending_interrupt = 1;
}
44 
/* resizable buffer */
typedef struct {
  char *buf;
  size_t size;
} memory;

/* curl write callback: append the incoming chunk to the memory buffer.
 * Returns the number of bytes consumed, or 0 to abort the transfer. */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  memory *mem = (memory *) ctx;
  size_t chunk = sz * nmemb;
  char *bigger = realloc(mem->buf, mem->size + chunk);
  if(bigger == NULL) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  memcpy(bigger + mem->size, contents, chunk);
  mem->buf = bigger;
  mem->size += chunk;
  return chunk;
}
66 
make_handle(char * url)67 CURL *make_handle(char *url)
68 {
69   CURL *handle = curl_easy_init();
70 
71   /* Important: use HTTP2 over HTTPS */
72   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
73   curl_easy_setopt(handle, CURLOPT_URL, url);
74 
75   /* buffer body */
76   memory *mem = malloc(sizeof(memory));
77   mem->size = 0;
78   mem->buf = malloc(1);
79   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
80   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
81   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
82 
83   /* For completeness */
84   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
85   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
86   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
87   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
88   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
89   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
90   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
91   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
92   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
93   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
94   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
95   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
96   return handle;
97 }
98 
99 /* HREF finder implemented in libxml2 but could be any HTML parser */
follow_links(CURLM * multi_handle,memory * mem,char * url)100 size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
101 {
102   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
103              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
104   htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
105   if(!doc)
106     return 0;
107   xmlChar *xpath = (xmlChar*) "//a/@href";
108   xmlXPathContextPtr context = xmlXPathNewContext(doc);
109   xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
110   xmlXPathFreeContext(context);
111   if(!result)
112     return 0;
113   xmlNodeSetPtr nodeset = result->nodesetval;
114   if(xmlXPathNodeSetIsEmpty(nodeset)) {
115     xmlXPathFreeObject(result);
116     return 0;
117   }
118   size_t count = 0;
119   int i;
120   for(i = 0; i < nodeset->nodeNr; i++) {
121     double r = rand();
122     int x = r * nodeset->nodeNr / RAND_MAX;
123     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
124     xmlChar *href = xmlNodeListGetString(doc, node, 1);
125     if(follow_relative_links) {
126       xmlChar *orig = href;
127       href = xmlBuildURI(href, (xmlChar *) url);
128       xmlFree(orig);
129     }
130     char *link = (char *) href;
131     if(!link || strlen(link) < 20)
132       continue;
133     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
134       curl_multi_add_handle(multi_handle, make_handle(link));
135       if(count++ == max_link_per_page)
136         break;
137     }
138     xmlFree(link);
139   }
140   xmlXPathFreeObject(result);
141   return count;
142 }
143 
/* True when the Content-Type header names an HTML body.
 * Accepts both bare "text/html" and "text/html; charset=...".
 * (The original also required strlen(ctype) > 10, which wrongly
 * rejected a bare "text/html" header — only 9 characters.) */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
148 
main(void)149 int main(void)
150 {
151   signal(SIGINT, sighandler);
152   LIBXML_TEST_VERSION;
153   curl_global_init(CURL_GLOBAL_DEFAULT);
154   CURLM *multi_handle = curl_multi_init();
155   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
156   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
157 
158   /* enables http/2 if available */
159 #ifdef CURLPIPE_MULTIPLEX
160   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
161 #endif
162 
163   /* sets html start page */
164   curl_multi_add_handle(multi_handle, make_handle(start_page));
165 
166   int msgs_left;
167   int pending = 0;
168   int complete = 0;
169   int still_running = 1;
170   while(still_running && !pending_interrupt) {
171     int numfds;
172     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
173     curl_multi_perform(multi_handle, &still_running);
174 
175     /* See how the transfers went */
176     CURLMsg *m = NULL;
177     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
178       if(m->msg == CURLMSG_DONE) {
179         CURL *handle = m->easy_handle;
180         char *url;
181         memory *mem;
182         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
183         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
184         if(m->data.result == CURLE_OK) {
185           long res_status;
186           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
187           if(res_status == 200) {
188             char *ctype;
189             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
190             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
191             if(is_html(ctype) && mem->size > 100) {
192               if(pending < max_requests && (complete + pending) < max_total) {
193                 pending += follow_links(multi_handle, mem, url);
194                 still_running = 1;
195               }
196             }
197           }
198           else {
199             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
200           }
201         }
202         else {
203           printf("[%d] Connection failure: %s\n", complete, url);
204         }
205         curl_multi_remove_handle(multi_handle, handle);
206         curl_easy_cleanup(handle);
207         free(mem->buf);
208         free(mem);
209         complete++;
210         pending--;
211       }
212     }
213   }
214   curl_multi_cleanup(multi_handle);
215   curl_global_cleanup();
216   return 0;
217 }
218