• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 /* <DESC>
23  * Get a web page, extract the title with libxml.
24  * </DESC>
25  */
26 // Written by Lars Nilsson
27 //
28 // GNU C++ compile command line suggestion (edit paths accordingly):
29 //
30 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
31 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
32 
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <string>
37 #include <curl/curl.h>
38 #include <libxml/HTMLparser.h>
39 
40 //
41 //  Case-insensitive string comparison
42 //
43 
// COMPARE(a, b) is true when the two C strings are equal ignoring letter
// case. MSVC names the comparison routine _stricmp; every other toolchain
// is expected to provide strcasecmp.
#if defined(_MSC_VER)
#  define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
#  define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
49 
50 //
51 //  libxml callback context structure
52 //
53 
// State shared by the SAX callbacks while one document is being parsed.
struct Context
{
  // True while character data should be appended to `title`.
  bool addTitle;

  // Text collected so far.
  std::string title;

  // Collection starts switched off with an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
61 
62 //
63 //  libcurl variables for error strings and returned data
64 
65 static char errorBuffer[CURL_ERROR_SIZE];
66 static std::string buffer;
67 
68 //
69 //  libcurl write callback function
70 //
71 
//
//  Appends one chunk of received data to *writerData.
//
//  data       - start of the received bytes; the length is given explicitly
//  size/nmemb - their product is the number of bytes available at data
//  writerData - destination string (registered through CURLOPT_WRITEDATA)
//
//  Returns the number of bytes consumed (size * nmemb), or 0 when no
//  destination was supplied. The return type is size_t to match libcurl's
//  write-callback prototype; an int cannot represent every size_t byte
//  count.
//
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t bytes = size * nmemb;

  writerData->append(data, bytes);

  return bytes;
}
82 
83 //
84 //  libcurl connection initialization
85 //
86 
init(CURL * & conn,char * url)87 static bool init(CURL *&conn, char *url)
88 {
89   CURLcode code;
90 
91   conn = curl_easy_init();
92 
93   if (conn == NULL)
94   {
95     fprintf(stderr, "Failed to create CURL connection\n");
96 
97     exit(EXIT_FAILURE);
98   }
99 
100   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101   if (code != CURLE_OK)
102   {
103     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
104 
105     return false;
106   }
107 
108   code = curl_easy_setopt(conn, CURLOPT_URL, url);
109   if (code != CURLE_OK)
110   {
111     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
112 
113     return false;
114   }
115 
116   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
117   if (code != CURLE_OK)
118   {
119     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
120 
121     return false;
122   }
123 
124   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
125   if (code != CURLE_OK)
126   {
127     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
128 
129     return false;
130   }
131 
132   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
133   if (code != CURLE_OK)
134   {
135     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
136 
137     return false;
138   }
139 
140   return true;
141 }
142 
143 //
144 //  libxml start element callback function
145 //
146 
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)147 static void StartElement(void *voidContext,
148                          const xmlChar *name,
149                          const xmlChar **attributes)
150 {
151   Context *context = (Context *)voidContext;
152 
153   if (COMPARE((char *)name, "TITLE"))
154   {
155     context->title = "";
156     context->addTitle = true;
157   }
158   (void) attributes;
159 }
160 
161 //
162 //  libxml end element callback function
163 //
164 
EndElement(void * voidContext,const xmlChar * name)165 static void EndElement(void *voidContext,
166                        const xmlChar *name)
167 {
168   Context *context = (Context *)voidContext;
169 
170   if (COMPARE((char *)name, "TITLE"))
171     context->addTitle = false;
172 }
173 
174 //
175 //  Text handling helper function
176 //
177 
handleCharacters(Context * context,const xmlChar * chars,int length)178 static void handleCharacters(Context *context,
179                              const xmlChar *chars,
180                              int length)
181 {
182   if (context->addTitle)
183     context->title.append((char *)chars, length);
184 }
185 
186 //
187 //  libxml PCDATA callback function
188 //
189 
Characters(void * voidContext,const xmlChar * chars,int length)190 static void Characters(void *voidContext,
191                        const xmlChar *chars,
192                        int length)
193 {
194   Context *context = (Context *)voidContext;
195 
196   handleCharacters(context, chars, length);
197 }
198 
199 //
200 //  libxml CDATA callback function
201 //
202 
cdata(void * voidContext,const xmlChar * chars,int length)203 static void cdata(void *voidContext,
204                   const xmlChar *chars,
205                   int length)
206 {
207   Context *context = (Context *)voidContext;
208 
209   handleCharacters(context, chars, length);
210 }
211 
212 //
213 //  libxml SAX callback structure
214 //
215 
216 static htmlSAXHandler saxHandler =
217 {
218   NULL,
219   NULL,
220   NULL,
221   NULL,
222   NULL,
223   NULL,
224   NULL,
225   NULL,
226   NULL,
227   NULL,
228   NULL,
229   NULL,
230   NULL,
231   NULL,
232   StartElement,
233   EndElement,
234   NULL,
235   Characters,
236   NULL,
237   NULL,
238   NULL,
239   NULL,
240   NULL,
241   NULL,
242   NULL,
243   cdata,
244   NULL
245 };
246 
247 //
248 //  Parse given (assumed to be) HTML text and return the title
249 //
250 
parseHtml(const std::string & html,std::string & title)251 static void parseHtml(const std::string &html,
252                       std::string &title)
253 {
254   htmlParserCtxtPtr ctxt;
255   Context context;
256 
257   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
258                                   XML_CHAR_ENCODING_NONE);
259 
260   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
261   htmlParseChunk(ctxt, "", 0, 1);
262 
263   htmlFreeParserCtxt(ctxt);
264 
265   title = context.title;
266 }
267 
main(int argc,char * argv[])268 int main(int argc, char *argv[])
269 {
270   CURL *conn = NULL;
271   CURLcode code;
272   std::string title;
273 
274   // Ensure one argument is given
275 
276   if (argc != 2)
277   {
278     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
279 
280     exit(EXIT_FAILURE);
281   }
282 
283   curl_global_init(CURL_GLOBAL_DEFAULT);
284 
285   // Initialize CURL connection
286 
287   if (!init(conn, argv[1]))
288   {
289     fprintf(stderr, "Connection initializion failed\n");
290 
291     exit(EXIT_FAILURE);
292   }
293 
294   // Retrieve content for the URL
295 
296   code = curl_easy_perform(conn);
297   curl_easy_cleanup(conn);
298 
299   if (code != CURLE_OK)
300   {
301     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
302 
303     exit(EXIT_FAILURE);
304   }
305 
306   // Parse the (assumed) HTML code
307 
308   parseHtml(buffer, title);
309 
310   // Display the extracted title
311 
312   printf("Title: %s\n", title.c_str());
313 
314   return EXIT_SUCCESS;
315 }
316