1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2020, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22 /* <DESC>
23 * Get a web page, extract the title with libxml.
24 * </DESC>
25
26 Written by Lars Nilsson
27
28 GNU C++ compile command line suggestion (edit paths accordingly):
29
30 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
31 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
32 */
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <string>
37 #include <curl/curl.h>
38 #include <libxml/HTMLparser.h>
39
40 //
41 // Case-insensitive string comparison
42 //
43
// COMPARE(a, b) evaluates to true when the two C strings are equal ignoring
// case (the underlying functions return 0 on a match, hence the negation).
// MSVC builds use _stricmp; every other compiler gets strcasecmp.
#ifdef _MSC_VER
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
49
50 //
51 // libxml callback context structure
52 //
53
// State shared between the SAX callbacks while one document is parsed.
struct Context
{
  // True while the parser is between an opening and a closing TITLE tag.
  bool addTitle;

  // Text gathered so far for the title.
  std::string title;

  // Starts outside of any TITLE element with an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
61
62 //
63 // libcurl variables for error strings and returned data
64
// Error text storage; init() registers it with libcurl through
// CURLOPT_ERRORBUFFER and its contents are printed when a call fails.
static char errorBuffer[CURL_ERROR_SIZE];
// Receives the downloaded data: init() hands its address to libcurl as
// CURLOPT_WRITEDATA, and main() passes it on to parseHtml().
static std::string buffer;
67
68 //
69 // libcurl write callback function
70 //
71
// Appends the size * nmemb bytes found at 'data' to the std::string that
// 'writerData' points to.
//
// Returns the number of bytes stored. The return type is size_t because that
// is what libcurl's write callback contract requires; returning a narrower
// int would truncate large byte counts. A NULL 'writerData' stores nothing
// and returns 0, which libcurl treats as a failed write when data was
// delivered.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if(writerData == NULL)
    return 0;

  const size_t byteCount = size * nmemb;

  writerData->append(data, byteCount);

  return byteCount;
}
82
83 //
84 // libcurl connection initialization
85 //
86
init(CURL * & conn,char * url)87 static bool init(CURL *&conn, char *url)
88 {
89 CURLcode code;
90
91 conn = curl_easy_init();
92
93 if(conn == NULL) {
94 fprintf(stderr, "Failed to create CURL connection\n");
95 exit(EXIT_FAILURE);
96 }
97
98 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
99 if(code != CURLE_OK) {
100 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
101 return false;
102 }
103
104 code = curl_easy_setopt(conn, CURLOPT_URL, url);
105 if(code != CURLE_OK) {
106 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
107 return false;
108 }
109
110 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
111 if(code != CURLE_OK) {
112 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
113 return false;
114 }
115
116 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
117 if(code != CURLE_OK) {
118 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
119 return false;
120 }
121
122 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
123 if(code != CURLE_OK) {
124 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
125 return false;
126 }
127
128 return true;
129 }
130
131 //
132 // libxml start element callback function
133 //
134
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)135 static void StartElement(void *voidContext,
136 const xmlChar *name,
137 const xmlChar **attributes)
138 {
139 Context *context = static_cast<Context *>(voidContext);
140
141 if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) {
142 context->title = "";
143 context->addTitle = true;
144 }
145 (void) attributes;
146 }
147
148 //
149 // libxml end element callback function
150 //
151
EndElement(void * voidContext,const xmlChar * name)152 static void EndElement(void *voidContext,
153 const xmlChar *name)
154 {
155 Context *context = static_cast<Context *>(voidContext);
156
157 if(COMPARE(reinterpret_cast<char *>(name), "TITLE"))
158 context->addTitle = false;
159 }
160
161 //
162 // Text handling helper function
163 //
164
handleCharacters(Context * context,const xmlChar * chars,int length)165 static void handleCharacters(Context *context,
166 const xmlChar *chars,
167 int length)
168 {
169 if(context->addTitle)
170 context->title.append(reinterpret_cast<char *>(chars), length);
171 }
172
173 //
174 // libxml PCDATA callback function
175 //
176
Characters(void * voidContext,const xmlChar * chars,int length)177 static void Characters(void *voidContext,
178 const xmlChar *chars,
179 int length)
180 {
181 Context *context = static_cast<Context *>(voidContext);
182
183 handleCharacters(context, chars, length);
184 }
185
186 //
187 // libxml CDATA callback function
188 //
189
cdata(void * voidContext,const xmlChar * chars,int length)190 static void cdata(void *voidContext,
191 const xmlChar *chars,
192 int length)
193 {
194 Context *context = static_cast<Context *>(voidContext);
195
196 handleCharacters(context, chars, length);
197 }
198
199 //
200 // libxml SAX callback structure
201 //
202
// Handler table passed to the push parser in parseHtml(). The initializer is
// positional, so the order of entries is significant; only four slots carry
// a callback and every other listed slot is NULL.
//
// NOTE(review): the slot names in the comments below follow the member order
// of libxml2's xmlSAXHandler declaration - confirm against the installed
// libxml/parser.h before relying on them.
static htmlSAXHandler saxHandler =
{
  NULL,          // internalSubset
  NULL,          // isStandalone
  NULL,          // hasInternalSubset
  NULL,          // hasExternalSubset
  NULL,          // resolveEntity
  NULL,          // getEntity
  NULL,          // entityDecl
  NULL,          // notationDecl
  NULL,          // attributeDecl
  NULL,          // elementDecl
  NULL,          // unparsedEntityDecl
  NULL,          // setDocumentLocator
  NULL,          // startDocument
  NULL,          // endDocument
  StartElement,  // startElement
  EndElement,    // endElement
  NULL,          // reference
  Characters,    // characters
  NULL,          // ignorableWhitespace
  NULL,          // processingInstruction
  NULL,          // comment
  NULL,          // warning
  NULL,          // error
  NULL,          // fatalError
  NULL,          // getParameterEntity
  cdata,         // cdataBlock
  NULL           // externalSubset
};
233
234 //
235 // Parse given (assumed to be) HTML text and return the title
236 //
237
parseHtml(const std::string & html,std::string & title)238 static void parseHtml(const std::string &html,
239 std::string &title)
240 {
241 htmlParserCtxtPtr ctxt;
242 Context context;
243
244 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
245 XML_CHAR_ENCODING_NONE);
246
247 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
248 htmlParseChunk(ctxt, "", 0, 1);
249
250 htmlFreeParserCtxt(ctxt);
251
252 title = context.title;
253 }
254
main(int argc,char * argv[])255 int main(int argc, char *argv[])
256 {
257 CURL *conn = NULL;
258 CURLcode code;
259 std::string title;
260
261 // Ensure one argument is given
262
263 if(argc != 2) {
264 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
265 exit(EXIT_FAILURE);
266 }
267
268 curl_global_init(CURL_GLOBAL_DEFAULT);
269
270 // Initialize CURL connection
271
272 if(!init(conn, argv[1])) {
273 fprintf(stderr, "Connection initializion failed\n");
274 exit(EXIT_FAILURE);
275 }
276
277 // Retrieve content for the URL
278
279 code = curl_easy_perform(conn);
280 curl_easy_cleanup(conn);
281
282 if(code != CURLE_OK) {
283 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
284 exit(EXIT_FAILURE);
285 }
286
287 // Parse the (assumed) HTML code
288 parseHtml(buffer, title);
289
290 // Display the extracted title
291 printf("Title: %s\n", title.c_str());
292
293 return EXIT_SUCCESS;
294 }
295