1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at http://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22 // Get a web page, parse it with libxml.
23 //
24 // Written by Lars Nilsson
25 //
26 // GNU C++ compile command line suggestion (edit paths accordingly):
27 //
28 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
29 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
30
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <string>
35 #include <curl/curl.h>
36 #include <libxml/HTMLparser.h>
37
38 //
39 // Case-insensitive string comparison
40 //
41
// COMPARE(a, b) evaluates to true when the C strings a and b are equal
// ignoring letter case. MSVC provides the comparison as _stricmp (the
// unprefixed stricmp is a deprecated alias there); other toolchains use
// strcasecmp.
#ifdef _MSC_VER
#define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
47
48 //
49 // libxml callback context structure
50 //
51
// State handed to the libxml callbacks through their void* context argument.
struct Context
{
  // True while character data should be appended to `title`.
  bool addTitle;

  // Title text collected so far.
  std::string title;

  // Begin with collection switched off and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
59
60 //
61 // libcurl variables for error strings and returned data
62
63 static char errorBuffer[CURL_ERROR_SIZE];
64 static std::string buffer;
65
66 //
67 // libcurl write callback function
68 //
69
// libcurl write callback: appends the size * nmemb bytes starting at `data`
// to *writerData.
//
// Returns the number of bytes taken (size * nmemb), or 0 when writerData is
// NULL. The return type is size_t, as specified by the CURLOPT_WRITEFUNCTION
// callback prototype, so the byte count is never narrowed to int.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t byteCount = size * nmemb;

  writerData->append(data, byteCount);

  return byteCount;
}
80
81 //
82 // libcurl connection initialization
83 //
84
init(CURL * & conn,char * url)85 static bool init(CURL *&conn, char *url)
86 {
87 CURLcode code;
88
89 conn = curl_easy_init();
90
91 if (conn == NULL)
92 {
93 fprintf(stderr, "Failed to create CURL connection\n");
94
95 exit(EXIT_FAILURE);
96 }
97
98 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
99 if (code != CURLE_OK)
100 {
101 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
102
103 return false;
104 }
105
106 code = curl_easy_setopt(conn, CURLOPT_URL, url);
107 if (code != CURLE_OK)
108 {
109 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
110
111 return false;
112 }
113
114 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
115 if (code != CURLE_OK)
116 {
117 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
118
119 return false;
120 }
121
122 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
123 if (code != CURLE_OK)
124 {
125 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
126
127 return false;
128 }
129
130 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
131 if (code != CURLE_OK)
132 {
133 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
134
135 return false;
136 }
137
138 return true;
139 }
140
141 //
142 // libxml start element callback function
143 //
144
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)145 static void StartElement(void *voidContext,
146 const xmlChar *name,
147 const xmlChar **attributes)
148 {
149 Context *context = (Context *)voidContext;
150
151 if (COMPARE((char *)name, "TITLE"))
152 {
153 context->title = "";
154 context->addTitle = true;
155 }
156 (void) attributes;
157 }
158
159 //
160 // libxml end element callback function
161 //
162
EndElement(void * voidContext,const xmlChar * name)163 static void EndElement(void *voidContext,
164 const xmlChar *name)
165 {
166 Context *context = (Context *)voidContext;
167
168 if (COMPARE((char *)name, "TITLE"))
169 context->addTitle = false;
170 }
171
172 //
173 // Text handling helper function
174 //
175
handleCharacters(Context * context,const xmlChar * chars,int length)176 static void handleCharacters(Context *context,
177 const xmlChar *chars,
178 int length)
179 {
180 if (context->addTitle)
181 context->title.append((char *)chars, length);
182 }
183
184 //
185 // libxml PCDATA callback function
186 //
187
Characters(void * voidContext,const xmlChar * chars,int length)188 static void Characters(void *voidContext,
189 const xmlChar *chars,
190 int length)
191 {
192 Context *context = (Context *)voidContext;
193
194 handleCharacters(context, chars, length);
195 }
196
197 //
198 // libxml CDATA callback function
199 //
200
cdata(void * voidContext,const xmlChar * chars,int length)201 static void cdata(void *voidContext,
202 const xmlChar *chars,
203 int length)
204 {
205 Context *context = (Context *)voidContext;
206
207 handleCharacters(context, chars, length);
208 }
209
210 //
211 // libxml SAX callback structure
212 //
213
// SAX handler table passed to htmlCreatePushParserCtxt() in parseHtml().
//
// The initializer is positional: an entry's position decides which callback
// slot of htmlSAXHandler it fills, so entries must not be reordered, added
// or removed without checking the struct declaration. Only the four
// callbacks defined in this file are installed; every other listed slot is
// NULL, and any members beyond the listed ones are value-initialized by the
// aggregate-initialization rules.
static htmlSAXHandler saxHandler =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  StartElement,   // start-of-element callback
  EndElement,     // end-of-element callback
  NULL,
  Characters,     // PCDATA callback
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  cdata,          // CDATA callback
  NULL
};
244
245 //
246 // Parse given (assumed to be) HTML text and return the title
247 //
248
parseHtml(const std::string & html,std::string & title)249 static void parseHtml(const std::string &html,
250 std::string &title)
251 {
252 htmlParserCtxtPtr ctxt;
253 Context context;
254
255 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
256 XML_CHAR_ENCODING_NONE);
257
258 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
259 htmlParseChunk(ctxt, "", 0, 1);
260
261 htmlFreeParserCtxt(ctxt);
262
263 title = context.title;
264 }
265
main(int argc,char * argv[])266 int main(int argc, char *argv[])
267 {
268 CURL *conn = NULL;
269 CURLcode code;
270 std::string title;
271
272 // Ensure one argument is given
273
274 if (argc != 2)
275 {
276 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
277
278 exit(EXIT_FAILURE);
279 }
280
281 curl_global_init(CURL_GLOBAL_DEFAULT);
282
283 // Initialize CURL connection
284
285 if (!init(conn, argv[1]))
286 {
287 fprintf(stderr, "Connection initializion failed\n");
288
289 exit(EXIT_FAILURE);
290 }
291
292 // Retrieve content for the URL
293
294 code = curl_easy_perform(conn);
295 curl_easy_cleanup(conn);
296
297 if (code != CURLE_OK)
298 {
299 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
300
301 exit(EXIT_FAILURE);
302 }
303
304 // Parse the (assumed) HTML code
305
306 parseHtml(buffer, title);
307
308 // Display the extracted title
309
310 printf("Title: %s\n", title.c_str());
311
312 return EXIT_SUCCESS;
313 }
314