/*
 * HTMLtree.c : implementation of access function for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25 
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/parser.h"
30 #include "private/save.h"
31 
/************************************************************************
 *									*
 *		Getting/Setting encoding meta tags			*
 *									*
 ************************************************************************/
37 
38 /**
39  * htmlGetMetaEncoding:
40  * @doc:  the document
41  *
42  * Encoding definition lookup in the Meta tags
43  *
44  * Returns the current encoding as flagged in the HTML source
45  */
46 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)47 htmlGetMetaEncoding(htmlDocPtr doc) {
48     htmlNodePtr cur;
49     const xmlChar *content;
50     const xmlChar *encoding;
51 
52     if (doc == NULL)
53 	return(NULL);
54     cur = doc->children;
55 
56     /*
57      * Search the html
58      */
59     while (cur != NULL) {
60 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
61 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
62 		break;
63 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
64 		goto found_head;
65 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
66 		goto found_meta;
67 	}
68 	cur = cur->next;
69     }
70     if (cur == NULL)
71 	return(NULL);
72     cur = cur->children;
73 
74     /*
75      * Search the head
76      */
77     while (cur != NULL) {
78 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
79 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
80 		break;
81 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
82 		goto found_meta;
83 	}
84 	cur = cur->next;
85     }
86     if (cur == NULL)
87 	return(NULL);
88 found_head:
89     cur = cur->children;
90 
91     /*
92      * Search the meta elements
93      */
94 found_meta:
95     while (cur != NULL) {
96 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
97 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
98 		xmlAttrPtr attr = cur->properties;
99 		int http;
100 		const xmlChar *value;
101 
102 		content = NULL;
103 		http = 0;
104 		while (attr != NULL) {
105 		    if ((attr->children != NULL) &&
106 		        (attr->children->type == XML_TEXT_NODE) &&
107 		        (attr->children->next == NULL)) {
108 			value = attr->children->content;
109 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
110 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
111 			    http = 1;
112 			else if ((value != NULL)
113 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
114 			    content = value;
115 			if ((http != 0) && (content != NULL))
116 			    goto found_content;
117 		    }
118 		    attr = attr->next;
119 		}
120 	    }
121 	}
122 	cur = cur->next;
123     }
124     return(NULL);
125 
126 found_content:
127     encoding = xmlStrstr(content, BAD_CAST"charset=");
128     if (encoding == NULL)
129 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
130     if (encoding == NULL)
131 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
132     if (encoding != NULL) {
133 	encoding += 8;
134     } else {
135 	encoding = xmlStrstr(content, BAD_CAST"charset =");
136 	if (encoding == NULL)
137 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
138 	if (encoding == NULL)
139 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
140 	if (encoding != NULL)
141 	    encoding += 9;
142     }
143     if (encoding != NULL) {
144 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
145     }
146     return(encoding);
147 }
148 
149 /**
150  * htmlSetMetaEncoding:
151  * @doc:  the document
152  * @encoding:  the encoding string
153  *
154  * Sets the current encoding in the Meta tags
155  * NOTE: this will not change the document content encoding, just
156  * the META flag associated.
157  *
158  * Returns 0 in case of success and -1 in case of error
159  */
160 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)161 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
162     htmlNodePtr cur, meta = NULL, head = NULL;
163     const xmlChar *content = NULL;
164     char newcontent[100];
165 
166     newcontent[0] = 0;
167 
168     if (doc == NULL)
169 	return(-1);
170 
171     /* html isn't a real encoding it's just libxml2 way to get entities */
172     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
173         return(-1);
174 
175     if (encoding != NULL) {
176 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
177                 (char *)encoding);
178 	newcontent[sizeof(newcontent) - 1] = 0;
179     }
180 
181     cur = doc->children;
182 
183     /*
184      * Search the html
185      */
186     while (cur != NULL) {
187 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
188 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
189 		break;
190 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
191 		goto found_head;
192 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
193 		goto found_meta;
194 	}
195 	cur = cur->next;
196     }
197     if (cur == NULL)
198 	return(-1);
199     cur = cur->children;
200 
201     /*
202      * Search the head
203      */
204     while (cur != NULL) {
205 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
206 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
207 		break;
208 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
209                 head = cur->parent;
210 		goto found_meta;
211             }
212 	}
213 	cur = cur->next;
214     }
215     if (cur == NULL)
216 	return(-1);
217 found_head:
218     head = cur;
219     if (cur->children == NULL)
220         goto create;
221     cur = cur->children;
222 
223 found_meta:
224     /*
225      * Search and update all the remaining the meta elements carrying
226      * encoding information
227      */
228     while (cur != NULL) {
229 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
230 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
231 		xmlAttrPtr attr = cur->properties;
232 		int http;
233 		const xmlChar *value;
234 
235 		content = NULL;
236 		http = 0;
237 		while (attr != NULL) {
238 		    if ((attr->children != NULL) &&
239 		        (attr->children->type == XML_TEXT_NODE) &&
240 		        (attr->children->next == NULL)) {
241 			value = attr->children->content;
242 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
243 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
244 			    http = 1;
245 			else
246                         {
247                            if ((value != NULL) &&
248                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
249 			       content = value;
250                         }
251 		        if ((http != 0) && (content != NULL))
252 			    break;
253 		    }
254 		    attr = attr->next;
255 		}
256 		if ((http != 0) && (content != NULL)) {
257 		    meta = cur;
258 		    break;
259 		}
260 
261 	    }
262 	}
263 	cur = cur->next;
264     }
265 create:
266     if (meta == NULL) {
267         if ((encoding != NULL) && (head != NULL)) {
268             /*
269              * Create a new Meta element with the right attributes
270              */
271 
272             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
273             if (head->children == NULL)
274                 xmlAddChild(head, meta);
275             else
276                 xmlAddPrevSibling(head->children, meta);
277             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
278             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
279         }
280     } else {
281         /* remove the meta tag if NULL is passed */
282         if (encoding == NULL) {
283             xmlUnlinkNode(meta);
284             xmlFreeNode(meta);
285         }
286         /* change the document only if there is a real encoding change */
287         else if (xmlStrcasestr(content, encoding) == NULL) {
288             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
289         }
290     }
291 
292 
293     return(0);
294 }
295 
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The list is terminated by a NULL entry.
 */
static const char* const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
309 
310 
311 /**
312  * htmlIsBooleanAttr:
313  * @name:  the name of the attribute to check
314  *
315  * DEPRECATED: Internal function, don't use.
316  *
317  * Determine if a given attribute is a boolean attribute.
318  *
319  * returns: false if the attribute is not boolean, true otherwise.
320  */
321 int
htmlIsBooleanAttr(const xmlChar * name)322 htmlIsBooleanAttr(const xmlChar *name)
323 {
324     int i = 0;
325 
326     while (htmlBooleanAttrs[i] != NULL) {
327         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
328             return 1;
329         i++;
330     }
331     return 0;
332 }
333 
334 #ifdef LIBXML_OUTPUT_ENABLED
/************************************************************************
 *									*
 *			Output error handlers				*
 *									*
 ************************************************************************/
340 
341 /**
342  * htmlSaveErr:
343  * @code:  the error number
344  * @node:  the location of the error.
345  * @extra:  extra information
346  *
347  * Handle an out of memory condition
348  */
349 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)350 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351 {
352     const char *msg = NULL;
353     int res;
354 
355     switch(code) {
356         case XML_SAVE_NOT_UTF8:
357 	    msg = "string is not in UTF-8\n";
358 	    break;
359 	case XML_SAVE_CHAR_INVALID:
360 	    msg = "invalid character value\n";
361 	    break;
362 	case XML_SAVE_UNKNOWN_ENCODING:
363 	    msg = "unknown encoding %s\n";
364 	    break;
365 	case XML_SAVE_NO_DOCTYPE:
366 	    msg = "HTML has no DOCTYPE\n";
367 	    break;
368 	default:
369 	    msg = "unexpected error number\n";
370     }
371 
372     res = xmlRaiseError(NULL, NULL, NULL, NULL, node,
373                         XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
374                         extra, NULL, NULL, 0, 0,
375                         msg, extra);
376     if (res < 0)
377         xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
378 }
379 
/************************************************************************
 *									*
 *		Dumping HTML tree content to a simple buffer		*
 *									*
 ************************************************************************/
385 
386 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)387 htmlFindOutputEncoder(const char *encoding) {
388     xmlCharEncodingHandler *handler = NULL;
389 
390     if (encoding != NULL) {
391         int res;
392 
393         res = xmlOpenCharEncodingHandler(encoding, /* output */ 1,
394                                          &handler);
395         if (res != XML_ERR_OK)
396             htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
397     } else {
398         /*
399          * Fallback to HTML when the encoding is unspecified
400          */
401         xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402     }
403 
404     return(handler);
405 }
406 
407 /**
408  * htmlBufNodeDumpFormat:
409  * @buf:  the xmlBufPtr output
410  * @doc:  the document
411  * @cur:  the current node
412  * @format:  should formatting spaces been added
413  *
414  * Dump an HTML node, recursive behaviour,children are printed too.
415  *
416  * Returns the number of byte written or -1 in case of error
417  */
418 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)419 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
420 	           int format) {
421     size_t use;
422     size_t ret;
423     xmlOutputBufferPtr outbuf;
424 
425     if (cur == NULL) {
426 	return ((size_t) -1);
427     }
428     if (buf == NULL) {
429 	return ((size_t) -1);
430     }
431     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
432     if (outbuf == NULL)
433 	return ((size_t) -1);
434     memset(outbuf, 0, sizeof(xmlOutputBuffer));
435     outbuf->buffer = buf;
436     outbuf->encoder = NULL;
437     outbuf->writecallback = NULL;
438     outbuf->closecallback = NULL;
439     outbuf->context = NULL;
440     outbuf->written = 0;
441 
442     use = xmlBufUse(buf);
443     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
444     if (outbuf->error)
445         ret = (size_t) -1;
446     else
447         ret = xmlBufUse(buf) - use;
448     xmlFree(outbuf);
449     return (ret);
450 }
451 
452 /**
453  * htmlNodeDump:
454  * @buf:  the HTML buffer output
455  * @doc:  the document
456  * @cur:  the current node
457  *
458  * Dump an HTML node, recursive behaviour,children are printed too,
459  * and formatting returns are added.
460  *
461  * Returns the number of byte written or -1 in case of error
462  */
463 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)464 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
465     xmlBufPtr buffer;
466     size_t ret1;
467     int ret2;
468 
469     if ((buf == NULL) || (cur == NULL))
470         return(-1);
471 
472     xmlInitParser();
473     buffer = xmlBufFromBuffer(buf);
474     if (buffer == NULL)
475         return(-1);
476 
477     ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
478 
479     ret2 = xmlBufBackToBuffer(buffer, buf);
480 
481     if ((ret1 == (size_t) -1) || (ret2 < 0))
482         return(-1);
483     return(ret1 > INT_MAX ? INT_MAX : ret1);
484 }
485 
486 /**
487  * htmlNodeDumpFileFormat:
488  * @out:  the FILE pointer
489  * @doc:  the document
490  * @cur:  the current node
491  * @encoding: the document encoding
492  * @format:  should formatting spaces been added
493  *
494  * Dump an HTML node, recursive behaviour,children are printed too.
495  *
496  * TODO: if encoding == NULL try to save in the doc encoding
497  *
498  * returns: the number of byte written or -1 in case of failure.
499  */
500 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)501 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
502 	               xmlNodePtr cur, const char *encoding, int format) {
503     xmlOutputBufferPtr buf;
504     xmlCharEncodingHandlerPtr handler;
505     int ret;
506 
507     xmlInitParser();
508 
509     /*
510      * save the content to a temp buffer.
511      */
512     handler = htmlFindOutputEncoder(encoding);
513     buf = xmlOutputBufferCreateFile(out, handler);
514     if (buf == NULL)
515         return(0);
516 
517     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
518 
519     ret = xmlOutputBufferClose(buf);
520     return(ret);
521 }
522 
523 /**
524  * htmlNodeDumpFile:
525  * @out:  the FILE pointer
526  * @doc:  the document
527  * @cur:  the current node
528  *
529  * Dump an HTML node, recursive behaviour,children are printed too,
530  * and formatting returns are added.
531  */
532 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)533 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
534     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
535 }
536 
537 /**
538  * htmlDocDumpMemoryFormat:
539  * @cur:  the document
540  * @mem:  OUT: the memory pointer
541  * @size:  OUT: the memory length
542  * @format:  should formatting spaces been added
543  *
544  * Dump an HTML document in memory and return the xmlChar * and it's size.
545  * It's up to the caller to free the memory.
546  */
547 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)548 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
549     xmlOutputBufferPtr buf;
550     xmlCharEncodingHandlerPtr handler = NULL;
551     const char *encoding;
552 
553     xmlInitParser();
554 
555     if ((mem == NULL) || (size == NULL))
556         return;
557     *mem = NULL;
558     *size = 0;
559     if (cur == NULL)
560 	return;
561 
562     encoding = (const char *) htmlGetMetaEncoding(cur);
563     handler = htmlFindOutputEncoder(encoding);
564     buf = xmlAllocOutputBuffer(handler);
565     if (buf == NULL)
566 	return;
567 
568     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
569 
570     xmlOutputBufferFlush(buf);
571 
572     if (!buf->error) {
573         if (buf->conv != NULL) {
574             *size = xmlBufUse(buf->conv);
575             *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
576         } else {
577             *size = xmlBufUse(buf->buffer);
578             *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
579         }
580     }
581 
582     xmlOutputBufferClose(buf);
583 }
584 
585 /**
586  * htmlDocDumpMemory:
587  * @cur:  the document
588  * @mem:  OUT: the memory pointer
589  * @size:  OUT: the memory length
590  *
591  * Dump an HTML document in memory and return the xmlChar * and it's size.
592  * It's up to the caller to free the memory.
593  */
594 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)595 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
596 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
597 }
598 
599 
/************************************************************************
 *									*
 *		Dumping HTML tree content to an I/O output buffer	*
 *									*
 ************************************************************************/
605 
606 /**
607  * htmlDtdDumpOutput:
608  * @buf:  the HTML buffer output
609  * @doc:  the document
610  * @encoding:  the encoding string
611  *
612  * TODO: check whether encoding is needed
613  *
614  * Dump the HTML document DTD, if any.
615  */
616 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)617 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
618 	          const char *encoding ATTRIBUTE_UNUSED) {
619     xmlDtdPtr cur = doc->intSubset;
620 
621     if (cur == NULL) {
622 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
623 	return;
624     }
625     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
626     xmlOutputBufferWriteString(buf, (const char *)cur->name);
627     if (cur->ExternalID != NULL) {
628 	xmlOutputBufferWriteString(buf, " PUBLIC ");
629 	xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
630 	if (cur->SystemID != NULL) {
631 	    xmlOutputBufferWriteString(buf, " ");
632 	    xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
633 	}
634     } else if (cur->SystemID != NULL &&
635 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
636 	xmlOutputBufferWriteString(buf, " SYSTEM ");
637 	xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
638     }
639     xmlOutputBufferWriteString(buf, ">\n");
640 }
641 
642 /**
643  * htmlAttrDumpOutput:
644  * @buf:  the HTML buffer output
645  * @doc:  the document
646  * @cur:  the attribute pointer
647  *
648  * Dump an HTML attribute
649  */
650 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)651 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
652     xmlChar *value;
653 
654     /*
655      * The html output method should not escape a & character
656      * occurring in an attribute value immediately followed by
657      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
658      * This is implemented in xmlEncodeEntitiesReentrant
659      */
660 
661     if (cur == NULL) {
662 	return;
663     }
664     xmlOutputBufferWriteString(buf, " ");
665     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
666         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
667 	xmlOutputBufferWriteString(buf, ":");
668     }
669     xmlOutputBufferWriteString(buf, (const char *)cur->name);
670     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
671 	value = xmlNodeListGetString(doc, cur->children, 0);
672 	if (value) {
673 	    xmlOutputBufferWriteString(buf, "=");
674 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
675 		(cur->parent->ns == NULL) &&
676 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
677 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
678 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
679 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
680 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
681 		xmlChar *escaped;
682 		xmlChar *tmp = value;
683 
684 		while (IS_BLANK_CH(*tmp)) tmp++;
685 
686 		/*
687                  * Angle brackets are technically illegal in URIs, but they're
688                  * used in server side includes, for example. Curly brackets
689                  * are illegal as well and often used in templates.
690                  * Don't escape non-whitespace, printable ASCII chars for
691                  * improved interoperability. Only escape space, control
692                  * and non-ASCII chars.
693 		 */
694 		escaped = xmlURIEscapeStr(tmp,
695                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
696 		if (escaped != NULL) {
697 		    xmlOutputBufferWriteQuotedString(buf, escaped);
698 		    xmlFree(escaped);
699 		} else {
700                     buf->error = XML_ERR_NO_MEMORY;
701 		}
702 	    } else {
703 		xmlOutputBufferWriteQuotedString(buf, value);
704 	    }
705 	    xmlFree(value);
706 	} else  {
707             buf->error = XML_ERR_NO_MEMORY;
708 	}
709     }
710 }
711 
712 /**
713  * htmlNodeDumpFormatOutput:
714  * @buf:  the HTML buffer output
715  * @doc:  the document
716  * @cur:  the current node
717  * @encoding:  the encoding string (unused)
718  * @format:  should formatting spaces been added
719  *
720  * Dump an HTML node, recursive behaviour,children are printed too.
721  */
722 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)723 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
724 	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
725                          int format) {
726     xmlNodePtr root, parent;
727     xmlAttrPtr attr;
728     const htmlElemDesc * info;
729 
730     xmlInitParser();
731 
732     if ((cur == NULL) || (buf == NULL)) {
733 	return;
734     }
735 
736     root = cur;
737     parent = cur->parent;
738     while (1) {
739         switch (cur->type) {
740         case XML_HTML_DOCUMENT_NODE:
741         case XML_DOCUMENT_NODE:
742             if (((xmlDocPtr) cur)->intSubset != NULL) {
743                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
744             }
745             if (cur->children != NULL) {
746                 /* Always validate cur->parent when descending. */
747                 if (cur->parent == parent) {
748                     parent = cur;
749                     cur = cur->children;
750                     continue;
751                 }
752             } else {
753                 xmlOutputBufferWriteString(buf, "\n");
754             }
755             break;
756 
757         case XML_ELEMENT_NODE:
758             /*
759              * Some users like lxml are known to pass nodes with a corrupted
760              * tree structure. Fall back to a recursive call to handle this
761              * case.
762              */
763             if ((cur->parent != parent) && (cur->children != NULL)) {
764                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
765                 break;
766             }
767 
768             /*
769              * Get specific HTML info for that node.
770              */
771             if (cur->ns == NULL)
772                 info = htmlTagLookup(cur->name);
773             else
774                 info = NULL;
775 
776             xmlOutputBufferWriteString(buf, "<");
777             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
778                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
779                 xmlOutputBufferWriteString(buf, ":");
780             }
781             xmlOutputBufferWriteString(buf, (const char *)cur->name);
782             if (cur->nsDef)
783                 xmlNsListDumpOutput(buf, cur->nsDef);
784             attr = cur->properties;
785             while (attr != NULL) {
786                 htmlAttrDumpOutput(buf, doc, attr);
787                 attr = attr->next;
788             }
789 
790             if ((info != NULL) && (info->empty)) {
791                 xmlOutputBufferWriteString(buf, ">");
792             } else if (cur->children == NULL) {
793                 if ((info != NULL) && (info->saveEndTag != 0) &&
794                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
795                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
796                     xmlOutputBufferWriteString(buf, ">");
797                 } else {
798                     xmlOutputBufferWriteString(buf, "></");
799                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
800                         xmlOutputBufferWriteString(buf,
801                                 (const char *)cur->ns->prefix);
802                         xmlOutputBufferWriteString(buf, ":");
803                     }
804                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
805                     xmlOutputBufferWriteString(buf, ">");
806                 }
807             } else {
808                 xmlOutputBufferWriteString(buf, ">");
809                 if ((format) && (info != NULL) && (!info->isinline) &&
810                     (cur->children->type != HTML_TEXT_NODE) &&
811                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
812                     (cur->children != cur->last) &&
813                     (cur->name != NULL) &&
814                     (cur->name[0] != 'p')) /* p, pre, param */
815                     xmlOutputBufferWriteString(buf, "\n");
816                 parent = cur;
817                 cur = cur->children;
818                 continue;
819             }
820 
821             if ((format) && (cur->next != NULL) &&
822                 (info != NULL) && (!info->isinline)) {
823                 if ((cur->next->type != HTML_TEXT_NODE) &&
824                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
825                     (parent != NULL) &&
826                     (parent->name != NULL) &&
827                     (parent->name[0] != 'p')) /* p, pre, param */
828                     xmlOutputBufferWriteString(buf, "\n");
829             }
830 
831             break;
832 
833         case XML_ATTRIBUTE_NODE:
834             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
835             break;
836 
837         case HTML_TEXT_NODE:
838             if (cur->content == NULL)
839                 break;
840             if (((cur->name == (const xmlChar *)xmlStringText) ||
841                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
842                 ((parent == NULL) ||
843                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
844                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
845                 xmlChar *buffer;
846 
847                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
848                 if (buffer == NULL) {
849                     buf->error = XML_ERR_NO_MEMORY;
850                     return;
851                 }
852                 xmlOutputBufferWriteString(buf, (const char *)buffer);
853                 xmlFree(buffer);
854             } else {
855                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
856             }
857             break;
858 
859         case HTML_COMMENT_NODE:
860             if (cur->content != NULL) {
861                 xmlOutputBufferWriteString(buf, "<!--");
862                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
863                 xmlOutputBufferWriteString(buf, "-->");
864             }
865             break;
866 
867         case HTML_PI_NODE:
868             if (cur->name != NULL) {
869                 xmlOutputBufferWriteString(buf, "<?");
870                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
871                 if (cur->content != NULL) {
872                     xmlOutputBufferWriteString(buf, " ");
873                     xmlOutputBufferWriteString(buf,
874                             (const char *)cur->content);
875                 }
876                 xmlOutputBufferWriteString(buf, ">");
877             }
878             break;
879 
880         case HTML_ENTITY_REF_NODE:
881             xmlOutputBufferWriteString(buf, "&");
882             xmlOutputBufferWriteString(buf, (const char *)cur->name);
883             xmlOutputBufferWriteString(buf, ";");
884             break;
885 
886         case HTML_PRESERVE_NODE:
887             if (cur->content != NULL) {
888                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
889             }
890             break;
891 
892         default:
893             break;
894         }
895 
896         while (1) {
897             if (cur == root)
898                 return;
899             if (cur->next != NULL) {
900                 cur = cur->next;
901                 break;
902             }
903 
904             cur = parent;
905             /* cur->parent was validated when descending. */
906             parent = cur->parent;
907 
908             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
909                 (cur->type == XML_DOCUMENT_NODE)) {
910                 xmlOutputBufferWriteString(buf, "\n");
911             } else {
912                 if ((format) && (cur->ns == NULL))
913                     info = htmlTagLookup(cur->name);
914                 else
915                     info = NULL;
916 
917                 if ((format) && (info != NULL) && (!info->isinline) &&
918                     (cur->last->type != HTML_TEXT_NODE) &&
919                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
920                     (cur->children != cur->last) &&
921                     (cur->name != NULL) &&
922                     (cur->name[0] != 'p')) /* p, pre, param */
923                     xmlOutputBufferWriteString(buf, "\n");
924 
925                 xmlOutputBufferWriteString(buf, "</");
926                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
927                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
928                     xmlOutputBufferWriteString(buf, ":");
929                 }
930                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
931                 xmlOutputBufferWriteString(buf, ">");
932 
933                 if ((format) && (info != NULL) && (!info->isinline) &&
934                     (cur->next != NULL)) {
935                     if ((cur->next->type != HTML_TEXT_NODE) &&
936                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
937                         (parent != NULL) &&
938                         (parent->name != NULL) &&
939                         (parent->name[0] != 'p')) /* p, pre, param */
940                         xmlOutputBufferWriteString(buf, "\n");
941                 }
942             }
943         }
944     }
945 }
946 
947 /**
948  * htmlNodeDumpOutput:
949  * @buf:  the HTML buffer output
950  * @doc:  the document
951  * @cur:  the current node
952  * @encoding:  the encoding string (unused)
953  *
954  * Dump an HTML node, recursive behaviour,children are printed too,
955  * and formatting returns/spaces are added.
956  */
957 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)958 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
960     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
961 }
962 
963 /**
964  * htmlDocContentDumpFormatOutput:
965  * @buf:  the HTML buffer output
966  * @cur:  the document
967  * @encoding:  the encoding string (unused)
968  * @format:  should formatting spaces been added
969  *
970  * Dump an HTML document.
971  */
972 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)973 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974 	                       const char *encoding ATTRIBUTE_UNUSED,
975                                int format) {
976     int type = 0;
977     if (cur) {
978         type = cur->type;
979         cur->type = XML_HTML_DOCUMENT_NODE;
980     }
981     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
982     if (cur)
983         cur->type = (xmlElementType) type;
984 }
985 
986 /**
987  * htmlDocContentDumpOutput:
988  * @buf:  the HTML buffer output
989  * @cur:  the document
990  * @encoding:  the encoding string (unused)
991  *
992  * Dump an HTML document. Formatting return/spaces are added.
993  */
994 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)995 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
996 	                 const char *encoding ATTRIBUTE_UNUSED) {
997     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
998 }
999 
1000 /************************************************************************
1001  *									*
1002  *		Saving functions front-ends				*
1003  *									*
1004  ************************************************************************/
1005 
1006 /**
1007  * htmlDocDump:
1008  * @f:  the FILE*
1009  * @cur:  the document
1010  *
1011  * Dump an HTML document to an open FILE.
1012  *
1013  * returns: the number of byte written or -1 in case of failure.
1014  */
1015 int
htmlDocDump(FILE * f,xmlDocPtr cur)1016 htmlDocDump(FILE *f, xmlDocPtr cur) {
1017     xmlOutputBufferPtr buf;
1018     xmlCharEncodingHandlerPtr handler = NULL;
1019     const char *encoding;
1020     int ret;
1021 
1022     xmlInitParser();
1023 
1024     if ((cur == NULL) || (f == NULL)) {
1025 	return(-1);
1026     }
1027 
1028     encoding = (const char *) htmlGetMetaEncoding(cur);
1029     handler = htmlFindOutputEncoder(encoding);
1030     buf = xmlOutputBufferCreateFile(f, handler);
1031     if (buf == NULL)
1032         return(-1);
1033     htmlDocContentDumpOutput(buf, cur, NULL);
1034 
1035     ret = xmlOutputBufferClose(buf);
1036     return(ret);
1037 }
1038 
1039 /**
1040  * htmlSaveFile:
1041  * @filename:  the filename (or URL)
1042  * @cur:  the document
1043  *
1044  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1045  * used.
1046  * returns: the number of byte written or -1 in case of failure.
1047  */
1048 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1049 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1050     xmlOutputBufferPtr buf;
1051     xmlCharEncodingHandlerPtr handler = NULL;
1052     const char *encoding;
1053     int ret;
1054 
1055     if ((cur == NULL) || (filename == NULL))
1056         return(-1);
1057 
1058     xmlInitParser();
1059 
1060     encoding = (const char *) htmlGetMetaEncoding(cur);
1061     handler = htmlFindOutputEncoder(encoding);
1062     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1063     if (buf == NULL)
1064         return(0);
1065 
1066     htmlDocContentDumpOutput(buf, cur, NULL);
1067 
1068     ret = xmlOutputBufferClose(buf);
1069     return(ret);
1070 }
1071 
1072 /**
1073  * htmlSaveFileFormat:
1074  * @filename:  the filename
1075  * @cur:  the document
1076  * @format:  should formatting spaces been added
1077  * @encoding: the document encoding
1078  *
1079  * Dump an HTML document to a file using a given encoding.
1080  *
1081  * returns: the number of byte written or -1 in case of failure.
1082  */
1083 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1084 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1085 	           const char *encoding, int format) {
1086     xmlOutputBufferPtr buf;
1087     xmlCharEncodingHandlerPtr handler = NULL;
1088     int ret;
1089 
1090     if ((cur == NULL) || (filename == NULL))
1091         return(-1);
1092 
1093     xmlInitParser();
1094 
1095     handler = htmlFindOutputEncoder(encoding);
1096     if (handler != NULL)
1097         htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1098     else
1099 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1100 
1101     /*
1102      * save the content to a temp buffer.
1103      */
1104     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1105     if (buf == NULL)
1106         return(0);
1107 
1108     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1109 
1110     ret = xmlOutputBufferClose(buf);
1111     return(ret);
1112 }
1113 
1114 /**
1115  * htmlSaveFileEnc:
1116  * @filename:  the filename
1117  * @cur:  the document
1118  * @encoding: the document encoding
1119  *
1120  * Dump an HTML document to a file using a given encoding
1121  * and formatting returns/spaces are added.
1122  *
1123  * returns: the number of byte written or -1 in case of failure.
1124  */
1125 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1126 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1127     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1128 }
1129 
1130 #endif /* LIBXML_OUTPUT_ENABLED */
1131 
1132 #endif /* LIBXML_HTML_ENABLED */
1133