• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22 
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32 
33 /************************************************************************
34  *									*
35  *   		Getting/Setting encoding meta tags			*
36  *									*
37  ************************************************************************/
38 
39 /**
40  * htmlGetMetaEncoding:
41  * @doc:  the document
42  *
43  * Encoding definition lookup in the Meta tags
44  *
45  * Returns the current encoding as flagged in the HTML source
46  */
47 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)48 htmlGetMetaEncoding(htmlDocPtr doc) {
49     htmlNodePtr cur;
50     const xmlChar *content;
51     const xmlChar *encoding;
52 
53     if (doc == NULL)
54 	return(NULL);
55     cur = doc->children;
56 
57     /*
58      * Search the html
59      */
60     while (cur != NULL) {
61 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63 		break;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65 		goto found_head;
66 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67 		goto found_meta;
68 	}
69 	cur = cur->next;
70     }
71     if (cur == NULL)
72 	return(NULL);
73     cur = cur->children;
74 
75     /*
76      * Search the head
77      */
78     while (cur != NULL) {
79 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81 		break;
82 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83 		goto found_meta;
84 	}
85 	cur = cur->next;
86     }
87     if (cur == NULL)
88 	return(NULL);
89 found_head:
90     cur = cur->children;
91 
92     /*
93      * Search the meta elements
94      */
95 found_meta:
96     while (cur != NULL) {
97 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99 		xmlAttrPtr attr = cur->properties;
100 		int http;
101 		const xmlChar *value;
102 
103 		content = NULL;
104 		http = 0;
105 		while (attr != NULL) {
106 		    if ((attr->children != NULL) &&
107 		        (attr->children->type == XML_TEXT_NODE) &&
108 		        (attr->children->next == NULL)) {
109 			value = attr->children->content;
110 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112 			    http = 1;
113 			else if ((value != NULL)
114 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115 			    content = value;
116 			if ((http != 0) && (content != NULL))
117 			    goto found_content;
118 		    }
119 		    attr = attr->next;
120 		}
121 	    }
122 	}
123 	cur = cur->next;
124     }
125     return(NULL);
126 
127 found_content:
128     encoding = xmlStrstr(content, BAD_CAST"charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131     if (encoding == NULL)
132 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133     if (encoding != NULL) {
134 	encoding += 8;
135     } else {
136 	encoding = xmlStrstr(content, BAD_CAST"charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139 	if (encoding == NULL)
140 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141 	if (encoding != NULL)
142 	    encoding += 9;
143     }
144     if (encoding != NULL) {
145 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146     }
147     return(encoding);
148 }
149 
150 /**
151  * htmlSetMetaEncoding:
152  * @doc:  the document
153  * @encoding:  the encoding string
154  *
155  * Sets the current encoding in the Meta tags
156  * NOTE: this will not change the document content encoding, just
157  * the META flag associated.
158  *
159  * Returns 0 in case of success and -1 in case of error
160  */
161 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163     htmlNodePtr cur, meta;
164     const xmlChar *content;
165     char newcontent[100];
166 
167 
168     if (doc == NULL)
169 	return(-1);
170 
171     if (encoding != NULL) {
172 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173                 (char *)encoding);
174 	newcontent[sizeof(newcontent) - 1] = 0;
175     }
176 
177     cur = doc->children;
178 
179     /*
180      * Search the html
181      */
182     while (cur != NULL) {
183 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185 		break;
186 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187 		goto found_head;
188 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189 		goto found_meta;
190 	}
191 	cur = cur->next;
192     }
193     if (cur == NULL)
194 	return(-1);
195     cur = cur->children;
196 
197     /*
198      * Search the head
199      */
200     while (cur != NULL) {
201 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203 		break;
204 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205 		goto found_meta;
206 	}
207 	cur = cur->next;
208     }
209     if (cur == NULL)
210 	return(-1);
211 found_head:
212     if (cur->children == NULL) {
213 	if (encoding == NULL)
214 	    return(0);
215 	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216 	xmlAddChild(cur, meta);
217 	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218 	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219 	return(0);
220     }
221     cur = cur->children;
222 
223 found_meta:
224     if (encoding != NULL) {
225 	/*
226 	 * Create a new Meta element with the right attributes
227 	 */
228 
229 	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230 	xmlAddPrevSibling(cur, meta);
231 	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232 	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233     }
234 
235     /*
236      * Search and destroy all the remaining the meta elements carrying
237      * encoding informations
238      */
239     while (cur != NULL) {
240 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242 		xmlAttrPtr attr = cur->properties;
243 		int http;
244 		const xmlChar *value;
245 
246 		content = NULL;
247 		http = 0;
248 		while (attr != NULL) {
249 		    if ((attr->children != NULL) &&
250 		        (attr->children->type == XML_TEXT_NODE) &&
251 		        (attr->children->next == NULL)) {
252 			value = attr->children->content;
253 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255 			    http = 1;
256 			else
257                         {
258                            if ((value != NULL) &&
259 				(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260 			      content = value;
261                         }
262 		        if ((http != 0) && (content != NULL))
263 			    break;
264 		    }
265 		    attr = attr->next;
266 		}
267 		if ((http != 0) && (content != NULL)) {
268 		    meta = cur;
269 		    cur = cur->next;
270 		    xmlUnlinkNode(meta);
271                     xmlFreeNode(meta);
272 		    continue;
273 		}
274 
275 	    }
276 	}
277 	cur = cur->next;
278     }
279     return(0);
280 }
281 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated table of the HTML attributes which are output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295 
296 
297 /**
298  * htmlIsBooleanAttr:
299  * @name:  the name of the attribute to check
300  *
301  * Determine if a given attribute is a boolean attribute.
302  *
303  * returns: false if the attribute is not boolean, true otherwise.
304  */
305 int
htmlIsBooleanAttr(const xmlChar * name)306 htmlIsBooleanAttr(const xmlChar *name)
307 {
308     int i = 0;
309 
310     while (htmlBooleanAttrs[i] != NULL) {
311         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312             return 1;
313         i++;
314     }
315     return 0;
316 }
317 
318 #ifdef LIBXML_OUTPUT_ENABLED
319 /*
320  * private routine exported from xmlIO.c
321  */
322 xmlOutputBufferPtr
323 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
324 /************************************************************************
325  *									*
326  * 			Output error handlers				*
327  *									*
328  ************************************************************************/
329 /**
330  * htmlSaveErrMemory:
331  * @extra:  extra informations
332  *
333  * Handle an out of memory condition
334  */
335 static void
htmlSaveErrMemory(const char * extra)336 htmlSaveErrMemory(const char *extra)
337 {
338     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
339 }
340 
341 /**
342  * htmlSaveErr:
343  * @code:  the error number
344  * @node:  the location of the error.
345  * @extra:  extra informations
346  *
347  * Handle an out of memory condition
348  */
349 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)350 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351 {
352     const char *msg = NULL;
353 
354     switch(code) {
355         case XML_SAVE_NOT_UTF8:
356 	    msg = "string is not in UTF-8\n";
357 	    break;
358 	case XML_SAVE_CHAR_INVALID:
359 	    msg = "invalid character value\n";
360 	    break;
361 	case XML_SAVE_UNKNOWN_ENCODING:
362 	    msg = "unknown encoding %s\n";
363 	    break;
364 	case XML_SAVE_NO_DOCTYPE:
365 	    msg = "HTML has no DOCTYPE\n";
366 	    break;
367 	default:
368 	    msg = "unexpected error number\n";
369     }
370     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
371 }
372 
373 /************************************************************************
374  *									*
375  *   		Dumping HTML tree content to a simple buffer		*
376  *									*
377  ************************************************************************/
378 
379 static int
380 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
381 	           int format);
382 
383 /**
384  * htmlNodeDumpFormat:
385  * @buf:  the HTML buffer output
386  * @doc:  the document
387  * @cur:  the current node
388  * @format:  should formatting spaces been added
389  *
390  * Dump an HTML node, recursive behaviour,children are printed too.
391  *
392  * Returns the number of byte written or -1 in case of error
393  */
394 static int
htmlNodeDumpFormat(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)395 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
396 	           int format) {
397     unsigned int use;
398     int ret;
399     xmlOutputBufferPtr outbuf;
400 
401     if (cur == NULL) {
402 	return (-1);
403     }
404     if (buf == NULL) {
405 	return (-1);
406     }
407     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
408     if (outbuf == NULL) {
409         htmlSaveErrMemory("allocating HTML output buffer");
410 	return (-1);
411     }
412     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
413     outbuf->buffer = buf;
414     outbuf->encoder = NULL;
415     outbuf->writecallback = NULL;
416     outbuf->closecallback = NULL;
417     outbuf->context = NULL;
418     outbuf->written = 0;
419 
420     use = buf->use;
421     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
422     xmlFree(outbuf);
423     ret = buf->use - use;
424     return (ret);
425 }
426 
427 /**
428  * htmlNodeDump:
429  * @buf:  the HTML buffer output
430  * @doc:  the document
431  * @cur:  the current node
432  *
433  * Dump an HTML node, recursive behaviour,children are printed too,
434  * and formatting returns are added.
435  *
436  * Returns the number of byte written or -1 in case of error
437  */
438 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)439 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440     xmlInitParser();
441 
442     return(htmlNodeDumpFormat(buf, doc, cur, 1));
443 }
444 
445 /**
446  * htmlNodeDumpFileFormat:
447  * @out:  the FILE pointer
448  * @doc:  the document
449  * @cur:  the current node
450  * @encoding: the document encoding
451  * @format:  should formatting spaces been added
452  *
453  * Dump an HTML node, recursive behaviour,children are printed too.
454  *
455  * TODO: if encoding == NULL try to save in the doc encoding
456  *
457  * returns: the number of byte written or -1 in case of failure.
458  */
459 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)460 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
461 	               xmlNodePtr cur, const char *encoding, int format) {
462     xmlOutputBufferPtr buf;
463     xmlCharEncodingHandlerPtr handler = NULL;
464     int ret;
465 
466     xmlInitParser();
467 
468     if (encoding != NULL) {
469 	xmlCharEncoding enc;
470 
471 	enc = xmlParseCharEncoding(encoding);
472 	if (enc != XML_CHAR_ENCODING_UTF8) {
473 	    handler = xmlFindCharEncodingHandler(encoding);
474 	    if (handler == NULL)
475 		return(-1);
476 	}
477     }
478 
479     /*
480      * Fallback to HTML or ASCII when the encoding is unspecified
481      */
482     if (handler == NULL)
483 	handler = xmlFindCharEncodingHandler("HTML");
484     if (handler == NULL)
485 	handler = xmlFindCharEncodingHandler("ascii");
486 
487     /*
488      * save the content to a temp buffer.
489      */
490     buf = xmlOutputBufferCreateFile(out, handler);
491     if (buf == NULL) return(0);
492 
493     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
494 
495     ret = xmlOutputBufferClose(buf);
496     return(ret);
497 }
498 
499 /**
500  * htmlNodeDumpFile:
501  * @out:  the FILE pointer
502  * @doc:  the document
503  * @cur:  the current node
504  *
505  * Dump an HTML node, recursive behaviour,children are printed too,
506  * and formatting returns are added.
507  */
508 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)509 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
510     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
511 }
512 
513 /**
514  * htmlDocDumpMemoryFormat:
515  * @cur:  the document
516  * @mem:  OUT: the memory pointer
517  * @size:  OUT: the memory length
518  * @format:  should formatting spaces been added
519  *
520  * Dump an HTML document in memory and return the xmlChar * and it's size.
521  * It's up to the caller to free the memory.
522  */
523 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)524 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
525     xmlOutputBufferPtr buf;
526     xmlCharEncodingHandlerPtr handler = NULL;
527     const char *encoding;
528 
529     xmlInitParser();
530 
531     if ((mem == NULL) || (size == NULL))
532         return;
533     if (cur == NULL) {
534 	*mem = NULL;
535 	*size = 0;
536 	return;
537     }
538 
539     encoding = (const char *) htmlGetMetaEncoding(cur);
540 
541     if (encoding != NULL) {
542 	xmlCharEncoding enc;
543 
544 	enc = xmlParseCharEncoding(encoding);
545 	if (enc != cur->charset) {
546 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
547 		/*
548 		 * Not supported yet
549 		 */
550 		*mem = NULL;
551 		*size = 0;
552 		return;
553 	    }
554 
555 	    handler = xmlFindCharEncodingHandler(encoding);
556 	    if (handler == NULL) {
557 		*mem = NULL;
558 		*size = 0;
559 		return;
560 	    }
561 	} else {
562 	    handler = xmlFindCharEncodingHandler(encoding);
563 	}
564     }
565 
566     /*
567      * Fallback to HTML or ASCII when the encoding is unspecified
568      */
569     if (handler == NULL)
570 	handler = xmlFindCharEncodingHandler("HTML");
571     if (handler == NULL)
572 	handler = xmlFindCharEncodingHandler("ascii");
573 
574     buf = xmlAllocOutputBufferInternal(handler);
575     if (buf == NULL) {
576 	*mem = NULL;
577 	*size = 0;
578 	return;
579     }
580 
581 	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
582 
583     xmlOutputBufferFlush(buf);
584     if (buf->conv != NULL) {
585 	*size = buf->conv->use;
586 	*mem = xmlStrndup(buf->conv->content, *size);
587     } else {
588 	*size = buf->buffer->use;
589 	*mem = xmlStrndup(buf->buffer->content, *size);
590     }
591     (void)xmlOutputBufferClose(buf);
592 }
593 
594 /**
595  * htmlDocDumpMemory:
596  * @cur:  the document
597  * @mem:  OUT: the memory pointer
598  * @size:  OUT: the memory length
599  *
600  * Dump an HTML document in memory and return the xmlChar * and it's size.
601  * It's up to the caller to free the memory.
602  */
603 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)604 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
605 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
606 }
607 
608 
609 /************************************************************************
610  *									*
611  *   		Dumping HTML tree content to an I/O output buffer	*
612  *									*
613  ************************************************************************/
614 
615 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
616 
617 /**
618  * htmlDtdDumpOutput:
619  * @buf:  the HTML buffer output
620  * @doc:  the document
621  * @encoding:  the encoding string
622  *
623  * TODO: check whether encoding is needed
624  *
625  * Dump the HTML document DTD, if any.
626  */
627 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)628 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
629 	          const char *encoding ATTRIBUTE_UNUSED) {
630     xmlDtdPtr cur = doc->intSubset;
631 
632     if (cur == NULL) {
633 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
634 	return;
635     }
636     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
637     xmlOutputBufferWriteString(buf, (const char *)cur->name);
638     if (cur->ExternalID != NULL) {
639 	xmlOutputBufferWriteString(buf, " PUBLIC ");
640 	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
641 	if (cur->SystemID != NULL) {
642 	    xmlOutputBufferWriteString(buf, " ");
643 	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
644 	}
645     }  else if (cur->SystemID != NULL) {
646 	xmlOutputBufferWriteString(buf, " SYSTEM ");
647 	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
648     }
649     xmlOutputBufferWriteString(buf, ">\n");
650 }
651 
652 /**
653  * htmlAttrDumpOutput:
654  * @buf:  the HTML buffer output
655  * @doc:  the document
656  * @cur:  the attribute pointer
657  * @encoding:  the encoding string
658  *
659  * Dump an HTML attribute
660  */
661 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)662 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
663 	           const char *encoding ATTRIBUTE_UNUSED) {
664     xmlChar *value;
665 
666     /*
667      * TODO: The html output method should not escape a & character
668      *       occurring in an attribute value immediately followed by
669      *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
670      */
671 
672     if (cur == NULL) {
673 	return;
674     }
675     xmlOutputBufferWriteString(buf, " ");
676     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
677         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
678 	xmlOutputBufferWriteString(buf, ":");
679     }
680     xmlOutputBufferWriteString(buf, (const char *)cur->name);
681     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
682 	value = xmlNodeListGetString(doc, cur->children, 0);
683 	if (value) {
684 	    xmlOutputBufferWriteString(buf, "=");
685 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
686 		(cur->parent->ns == NULL) &&
687 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
688 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
689 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
690 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
691 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
692 		xmlChar *escaped;
693 		xmlChar *tmp = value;
694 
695 		while (IS_BLANK_CH(*tmp)) tmp++;
696 
697 		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
698 		if (escaped != NULL) {
699 		    xmlBufferWriteQuotedString(buf->buffer, escaped);
700 		    xmlFree(escaped);
701 		} else {
702 		    xmlBufferWriteQuotedString(buf->buffer, value);
703 		}
704 	    } else {
705 		xmlBufferWriteQuotedString(buf->buffer, value);
706 	    }
707 	    xmlFree(value);
708 	} else  {
709 	    xmlOutputBufferWriteString(buf, "=\"\"");
710 	}
711     }
712 }
713 
714 /**
715  * htmlAttrListDumpOutput:
716  * @buf:  the HTML buffer output
717  * @doc:  the document
718  * @cur:  the first attribute pointer
719  * @encoding:  the encoding string
720  *
721  * Dump a list of HTML attributes
722  */
723 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)724 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
725     if (cur == NULL) {
726 	return;
727     }
728     while (cur != NULL) {
729         htmlAttrDumpOutput(buf, doc, cur, encoding);
730 	cur = cur->next;
731     }
732 }
733 
734 
735 
736 /**
737  * htmlNodeListDumpOutput:
738  * @buf:  the HTML buffer output
739  * @doc:  the document
740  * @cur:  the first node
741  * @encoding:  the encoding string
742  * @format:  should formatting spaces been added
743  *
744  * Dump an HTML node list, recursive behaviour,children are printed too.
745  */
746 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)747 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
748 	               xmlNodePtr cur, const char *encoding, int format) {
749     if (cur == NULL) {
750 	return;
751     }
752     while (cur != NULL) {
753         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
754 	cur = cur->next;
755     }
756 }
757 
758 /**
759  * htmlNodeDumpFormatOutput:
760  * @buf:  the HTML buffer output
761  * @doc:  the document
762  * @cur:  the current node
763  * @encoding:  the encoding string
764  * @format:  should formatting spaces been added
765  *
766  * Dump an HTML node, recursive behaviour,children are printed too.
767  */
768 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)769 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
770 	                 xmlNodePtr cur, const char *encoding, int format) {
771     const htmlElemDesc * info;
772 
773     xmlInitParser();
774 
775     if ((cur == NULL) || (buf == NULL)) {
776 	return;
777     }
778     /*
779      * Special cases.
780      */
781     if (cur->type == XML_DTD_NODE)
782 	return;
783     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
784         (cur->type == XML_DOCUMENT_NODE)){
785 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
786 	return;
787     }
788     if (cur->type == XML_ATTRIBUTE_NODE) {
789         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
790 	return;
791     }
792     if (cur->type == HTML_TEXT_NODE) {
793 	if (cur->content != NULL) {
794 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
795 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
796 		((cur->parent == NULL) ||
797 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
798 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
799 		xmlChar *buffer;
800 
801 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
802 		if (buffer != NULL) {
803 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
804 		    xmlFree(buffer);
805 		}
806 	    } else {
807 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
808 	    }
809 	}
810 	return;
811     }
812     if (cur->type == HTML_COMMENT_NODE) {
813 	if (cur->content != NULL) {
814 	    xmlOutputBufferWriteString(buf, "<!--");
815 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
816 	    xmlOutputBufferWriteString(buf, "-->");
817 	}
818 	return;
819     }
820     if (cur->type == HTML_PI_NODE) {
821 	if (cur->name == NULL)
822 	    return;
823 	xmlOutputBufferWriteString(buf, "<?");
824 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
825 	if (cur->content != NULL) {
826 	    xmlOutputBufferWriteString(buf, " ");
827 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
828 	}
829 	xmlOutputBufferWriteString(buf, ">");
830 	return;
831     }
832     if (cur->type == HTML_ENTITY_REF_NODE) {
833         xmlOutputBufferWriteString(buf, "&");
834 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
835         xmlOutputBufferWriteString(buf, ";");
836 	return;
837     }
838     if (cur->type == HTML_PRESERVE_NODE) {
839 	if (cur->content != NULL) {
840 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
841 	}
842 	return;
843     }
844 
845     /*
846      * Get specific HTML info for that node.
847      */
848     if (cur->ns == NULL)
849 	info = htmlTagLookup(cur->name);
850     else
851 	info = NULL;
852 
853     xmlOutputBufferWriteString(buf, "<");
854     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
855         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
856 	xmlOutputBufferWriteString(buf, ":");
857     }
858     xmlOutputBufferWriteString(buf, (const char *)cur->name);
859     if (cur->nsDef)
860 	xmlNsListDumpOutput(buf, cur->nsDef);
861     if (cur->properties != NULL)
862         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
863 
864     if ((info != NULL) && (info->empty)) {
865         xmlOutputBufferWriteString(buf, ">");
866 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
867 	    if ((cur->next->type != HTML_TEXT_NODE) &&
868 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
869 		(cur->parent != NULL) &&
870 		(cur->parent->name != NULL) &&
871 		(cur->parent->name[0] != 'p')) /* p, pre, param */
872 		xmlOutputBufferWriteString(buf, "\n");
873 	}
874 	return;
875     }
876     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
877 	(cur->children == NULL)) {
878         if ((info != NULL) && (info->saveEndTag != 0) &&
879 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
880 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
881 	    xmlOutputBufferWriteString(buf, ">");
882 	} else {
883 	    xmlOutputBufferWriteString(buf, "></");
884             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
885                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
886                 xmlOutputBufferWriteString(buf, ":");
887             }
888 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
889 	    xmlOutputBufferWriteString(buf, ">");
890 	}
891 	if ((format) && (cur->next != NULL) &&
892             (info != NULL) && (!info->isinline)) {
893 	    if ((cur->next->type != HTML_TEXT_NODE) &&
894 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
895 		(cur->parent != NULL) &&
896 		(cur->parent->name != NULL) &&
897 		(cur->parent->name[0] != 'p')) /* p, pre, param */
898 		xmlOutputBufferWriteString(buf, "\n");
899 	}
900 	return;
901     }
902     xmlOutputBufferWriteString(buf, ">");
903     if ((cur->type != XML_ELEMENT_NODE) &&
904 	(cur->content != NULL)) {
905 	    /*
906 	     * Uses the OutputBuffer property to automatically convert
907 	     * invalids to charrefs
908 	     */
909 
910             xmlOutputBufferWriteString(buf, (const char *) cur->content);
911     }
912     if (cur->children != NULL) {
913         if ((format) && (info != NULL) && (!info->isinline) &&
914 	    (cur->children->type != HTML_TEXT_NODE) &&
915 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
916 	    (cur->children != cur->last) &&
917 	    (cur->name != NULL) &&
918 	    (cur->name[0] != 'p')) /* p, pre, param */
919 	    xmlOutputBufferWriteString(buf, "\n");
920 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
921         if ((format) && (info != NULL) && (!info->isinline) &&
922 	    (cur->last->type != HTML_TEXT_NODE) &&
923 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
924 	    (cur->children != cur->last) &&
925 	    (cur->name != NULL) &&
926 	    (cur->name[0] != 'p')) /* p, pre, param */
927 	    xmlOutputBufferWriteString(buf, "\n");
928     }
929     xmlOutputBufferWriteString(buf, "</");
930     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932 	xmlOutputBufferWriteString(buf, ":");
933     }
934     xmlOutputBufferWriteString(buf, (const char *)cur->name);
935     xmlOutputBufferWriteString(buf, ">");
936     if ((format) && (info != NULL) && (!info->isinline) &&
937 	(cur->next != NULL)) {
938         if ((cur->next->type != HTML_TEXT_NODE) &&
939 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
940 	    (cur->parent != NULL) &&
941 	    (cur->parent->name != NULL) &&
942 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
943 	    xmlOutputBufferWriteString(buf, "\n");
944     }
945 }
946 
947 /**
948  * htmlNodeDumpOutput:
949  * @buf:  the HTML buffer output
950  * @doc:  the document
951  * @cur:  the current node
952  * @encoding:  the encoding string
953  *
954  * Dump an HTML node, recursive behaviour,children are printed too,
955  * and formatting returns/spaces are added.
956  */
957 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)958 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959 	           xmlNodePtr cur, const char *encoding) {
960     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
961 }
962 
963 /**
964  * htmlDocContentDumpFormatOutput:
965  * @buf:  the HTML buffer output
966  * @cur:  the document
967  * @encoding:  the encoding string
968  * @format:  should formatting spaces been added
969  *
970  * Dump an HTML document.
971  */
972 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)973 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974 	                       const char *encoding, int format) {
975     int type;
976 
977     xmlInitParser();
978 
979     if ((buf == NULL) || (cur == NULL))
980         return;
981 
982     /*
983      * force to output the stuff as HTML, especially for entities
984      */
985     type = cur->type;
986     cur->type = XML_HTML_DOCUMENT_NODE;
987     if (cur->intSubset != NULL) {
988         htmlDtdDumpOutput(buf, cur, NULL);
989     }
990     if (cur->children != NULL) {
991         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
992     }
993     xmlOutputBufferWriteString(buf, "\n");
994     cur->type = (xmlElementType) type;
995 }
996 
997 /**
998  * htmlDocContentDumpOutput:
999  * @buf:  the HTML buffer output
1000  * @cur:  the document
1001  * @encoding:  the encoding string
1002  *
1003  * Dump an HTML document. Formating return/spaces are added.
1004  */
1005 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1006 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1007 	                 const char *encoding) {
1008     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1009 }
1010 
1011 /************************************************************************
1012  *									*
1013  *		Saving functions front-ends				*
1014  *									*
1015  ************************************************************************/
1016 
1017 /**
1018  * htmlDocDump:
1019  * @f:  the FILE*
1020  * @cur:  the document
1021  *
1022  * Dump an HTML document to an open FILE.
1023  *
1024  * returns: the number of byte written or -1 in case of failure.
1025  */
1026 int
htmlDocDump(FILE * f,xmlDocPtr cur)1027 htmlDocDump(FILE *f, xmlDocPtr cur) {
1028     xmlOutputBufferPtr buf;
1029     xmlCharEncodingHandlerPtr handler = NULL;
1030     const char *encoding;
1031     int ret;
1032 
1033     xmlInitParser();
1034 
1035     if ((cur == NULL) || (f == NULL)) {
1036 	return(-1);
1037     }
1038 
1039     encoding = (const char *) htmlGetMetaEncoding(cur);
1040 
1041     if (encoding != NULL) {
1042 	xmlCharEncoding enc;
1043 
1044 	enc = xmlParseCharEncoding(encoding);
1045 	if (enc != cur->charset) {
1046 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1047 		/*
1048 		 * Not supported yet
1049 		 */
1050 		return(-1);
1051 	    }
1052 
1053 	    handler = xmlFindCharEncodingHandler(encoding);
1054 	    if (handler == NULL)
1055 		return(-1);
1056 	} else {
1057 	    handler = xmlFindCharEncodingHandler(encoding);
1058 	}
1059     }
1060 
1061     /*
1062      * Fallback to HTML or ASCII when the encoding is unspecified
1063      */
1064     if (handler == NULL)
1065 	handler = xmlFindCharEncodingHandler("HTML");
1066     if (handler == NULL)
1067 	handler = xmlFindCharEncodingHandler("ascii");
1068 
1069     buf = xmlOutputBufferCreateFile(f, handler);
1070     if (buf == NULL) return(-1);
1071     htmlDocContentDumpOutput(buf, cur, NULL);
1072 
1073     ret = xmlOutputBufferClose(buf);
1074     return(ret);
1075 }
1076 
1077 /**
1078  * htmlSaveFile:
1079  * @filename:  the filename (or URL)
1080  * @cur:  the document
1081  *
1082  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1083  * used.
1084  * returns: the number of byte written or -1 in case of failure.
1085  */
1086 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1087 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1088     xmlOutputBufferPtr buf;
1089     xmlCharEncodingHandlerPtr handler = NULL;
1090     const char *encoding;
1091     int ret;
1092 
1093     if ((cur == NULL) || (filename == NULL))
1094         return(-1);
1095 
1096     xmlInitParser();
1097 
1098     encoding = (const char *) htmlGetMetaEncoding(cur);
1099 
1100     if (encoding != NULL) {
1101 	xmlCharEncoding enc;
1102 
1103 	enc = xmlParseCharEncoding(encoding);
1104 	if (enc != cur->charset) {
1105 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1106 		/*
1107 		 * Not supported yet
1108 		 */
1109 		return(-1);
1110 	    }
1111 
1112 	    handler = xmlFindCharEncodingHandler(encoding);
1113 	    if (handler == NULL)
1114 		return(-1);
1115 	}
1116     }
1117 
1118     /*
1119      * Fallback to HTML or ASCII when the encoding is unspecified
1120      */
1121     if (handler == NULL)
1122 	handler = xmlFindCharEncodingHandler("HTML");
1123     if (handler == NULL)
1124 	handler = xmlFindCharEncodingHandler("ascii");
1125 
1126     /*
1127      * save the content to a temp buffer.
1128      */
1129     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1130     if (buf == NULL) return(0);
1131 
1132     htmlDocContentDumpOutput(buf, cur, NULL);
1133 
1134     ret = xmlOutputBufferClose(buf);
1135     return(ret);
1136 }
1137 
1138 /**
1139  * htmlSaveFileFormat:
1140  * @filename:  the filename
1141  * @cur:  the document
1142  * @format:  should formatting spaces been added
1143  * @encoding: the document encoding
1144  *
1145  * Dump an HTML document to a file using a given encoding.
1146  *
1147  * returns: the number of byte written or -1 in case of failure.
1148  */
1149 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1150 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1151 	           const char *encoding, int format) {
1152     xmlOutputBufferPtr buf;
1153     xmlCharEncodingHandlerPtr handler = NULL;
1154     int ret;
1155 
1156     if ((cur == NULL) || (filename == NULL))
1157         return(-1);
1158 
1159     xmlInitParser();
1160 
1161     if (encoding != NULL) {
1162 	xmlCharEncoding enc;
1163 
1164 	enc = xmlParseCharEncoding(encoding);
1165 	if (enc != cur->charset) {
1166 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1167 		/*
1168 		 * Not supported yet
1169 		 */
1170 		return(-1);
1171 	    }
1172 
1173 	    handler = xmlFindCharEncodingHandler(encoding);
1174 	    if (handler == NULL)
1175 		return(-1);
1176             htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1177 	}
1178     } else {
1179 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1180     }
1181 
1182     /*
1183      * Fallback to HTML or ASCII when the encoding is unspecified
1184      */
1185     if (handler == NULL)
1186 	handler = xmlFindCharEncodingHandler("HTML");
1187     if (handler == NULL)
1188 	handler = xmlFindCharEncodingHandler("ascii");
1189 
1190     /*
1191      * save the content to a temp buffer.
1192      */
1193     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1194     if (buf == NULL) return(0);
1195 
1196     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1197 
1198     ret = xmlOutputBufferClose(buf);
1199     return(ret);
1200 }
1201 
1202 /**
1203  * htmlSaveFileEnc:
1204  * @filename:  the filename
1205  * @cur:  the document
1206  * @encoding: the document encoding
1207  *
1208  * Dump an HTML document to a file using a given encoding
1209  * and formatting returns/spaces are added.
1210  *
1211  * returns: the number of byte written or -1 in case of failure.
1212  */
1213 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1214 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1215     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1216 }
1217 
1218 #endif /* LIBXML_OUTPUT_ENABLED */
1219 
1220 #define bottom_HTMLtree
1221 #include "elfgcchack.h"
1222 #endif /* LIBXML_HTML_ENABLED */
1223