1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22 /* <DESC>
23 * Get a web page, extract the title with libxml.
24 * </DESC>
25 */
26 // Written by Lars Nilsson
27 //
28 // GNU C++ compile command line suggestion (edit paths accordingly):
29 //
30 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
31 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
32
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <string>
37 #include <curl/curl.h>
38 #include <libxml/HTMLparser.h>
39
40 //
41 // Case-insensitive string comparison
42 //
43
// COMPARE(a, b) is true when the two C strings are equal ignoring letter
// case. MSVC spells the comparison routine _stricmp; everything else uses
// strcasecmp.
#if defined(_MSC_VER)
#  define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
#  define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
49
50 //
51 // libxml callback context structure
52 //
53
struct Context
{
  // True while character data should be appended to `title`; set by
  // StartElement and cleared by EndElement.
  bool addTitle;

  // Title text collected so far.
  std::string title;

  // Start with title collection switched off and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
61
62 //
63 // libcurl variables for error strings and returned data
64
// Error text buffer handed to libcurl through CURLOPT_ERRORBUFFER in init();
// its contents are printed when a later libcurl call reports a failure.
static char errorBuffer[CURL_ERROR_SIZE];
// Downloaded data; registered as CURLOPT_WRITEDATA in init(), filled by
// writer(), and passed to parseHtml() from main().
static std::string buffer;
67
68 //
69 // libcurl write callback function
70 //
71
// Appends one chunk of received data to *writerData.
//
//   data        start of the chunk (not NUL-terminated)
//   size, nmemb element size and element count; the chunk is size * nmemb
//               bytes long
//   writerData  destination string registered with CURLOPT_WRITEDATA
//
// Returns the number of bytes consumed, or 0 when no destination was given
// (a short count tells libcurl to abort the transfer).
//
// The return type is size_t because that is the type libcurl's write
// callback contract uses; returning int would narrow the byte count and
// mismatch the function type libcurl calls through.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
82
83 //
84 // libcurl connection initialization
85 //
86
init(CURL * & conn,char * url)87 static bool init(CURL *&conn, char *url)
88 {
89 CURLcode code;
90
91 conn = curl_easy_init();
92
93 if (conn == NULL)
94 {
95 fprintf(stderr, "Failed to create CURL connection\n");
96
97 exit(EXIT_FAILURE);
98 }
99
100 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101 if (code != CURLE_OK)
102 {
103 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
104
105 return false;
106 }
107
108 code = curl_easy_setopt(conn, CURLOPT_URL, url);
109 if (code != CURLE_OK)
110 {
111 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
112
113 return false;
114 }
115
116 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
117 if (code != CURLE_OK)
118 {
119 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
120
121 return false;
122 }
123
124 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
125 if (code != CURLE_OK)
126 {
127 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
128
129 return false;
130 }
131
132 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
133 if (code != CURLE_OK)
134 {
135 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
136
137 return false;
138 }
139
140 return true;
141 }
142
143 //
144 // libxml start element callback function
145 //
146
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)147 static void StartElement(void *voidContext,
148 const xmlChar *name,
149 const xmlChar **attributes)
150 {
151 Context *context = (Context *)voidContext;
152
153 if (COMPARE((char *)name, "TITLE"))
154 {
155 context->title = "";
156 context->addTitle = true;
157 }
158 (void) attributes;
159 }
160
161 //
162 // libxml end element callback function
163 //
164
EndElement(void * voidContext,const xmlChar * name)165 static void EndElement(void *voidContext,
166 const xmlChar *name)
167 {
168 Context *context = (Context *)voidContext;
169
170 if (COMPARE((char *)name, "TITLE"))
171 context->addTitle = false;
172 }
173
174 //
175 // Text handling helper function
176 //
177
handleCharacters(Context * context,const xmlChar * chars,int length)178 static void handleCharacters(Context *context,
179 const xmlChar *chars,
180 int length)
181 {
182 if (context->addTitle)
183 context->title.append((char *)chars, length);
184 }
185
186 //
187 // libxml PCDATA callback function
188 //
189
Characters(void * voidContext,const xmlChar * chars,int length)190 static void Characters(void *voidContext,
191 const xmlChar *chars,
192 int length)
193 {
194 Context *context = (Context *)voidContext;
195
196 handleCharacters(context, chars, length);
197 }
198
199 //
200 // libxml CDATA callback function
201 //
202
cdata(void * voidContext,const xmlChar * chars,int length)203 static void cdata(void *voidContext,
204 const xmlChar *chars,
205 int length)
206 {
207 Context *context = (Context *)voidContext;
208
209 handleCharacters(context, chars, length);
210 }
211
212 //
213 // libxml SAX callback structure
214 //
215
// Callback table passed to htmlCreatePushParserCtxt() in parseHtml().
// The initializer is positional, so the order of the entries is significant:
// only the four callbacks defined in this file are installed and every other
// slot is left NULL.
// NOTE(review): the slot positions are assumed to line up with the
// htmlSAXHandler field order of the libxml2 headers in use -- confirm when
// upgrading libxml2.
static htmlSAXHandler saxHandler =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  StartElement,   // element opened
  EndElement,     // element closed
  NULL,
  Characters,     // character data
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  cdata,          // CDATA section
  NULL
};
246
247 //
248 // Parse given (assumed to be) HTML text and return the title
249 //
250
parseHtml(const std::string & html,std::string & title)251 static void parseHtml(const std::string &html,
252 std::string &title)
253 {
254 htmlParserCtxtPtr ctxt;
255 Context context;
256
257 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
258 XML_CHAR_ENCODING_NONE);
259
260 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
261 htmlParseChunk(ctxt, "", 0, 1);
262
263 htmlFreeParserCtxt(ctxt);
264
265 title = context.title;
266 }
267
main(int argc,char * argv[])268 int main(int argc, char *argv[])
269 {
270 CURL *conn = NULL;
271 CURLcode code;
272 std::string title;
273
274 // Ensure one argument is given
275
276 if (argc != 2)
277 {
278 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
279
280 exit(EXIT_FAILURE);
281 }
282
283 curl_global_init(CURL_GLOBAL_DEFAULT);
284
285 // Initialize CURL connection
286
287 if (!init(conn, argv[1]))
288 {
289 fprintf(stderr, "Connection initializion failed\n");
290
291 exit(EXIT_FAILURE);
292 }
293
294 // Retrieve content for the URL
295
296 code = curl_easy_perform(conn);
297 curl_easy_cleanup(conn);
298
299 if (code != CURLE_OK)
300 {
301 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
302
303 exit(EXIT_FAILURE);
304 }
305
306 // Parse the (assumed) HTML code
307
308 parseHtml(buffer, title);
309
310 // Display the extracted title
311
312 printf("Title: %s\n", title.c_str());
313
314 return EXIT_SUCCESS;
315 }
316