• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25 
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30 
31 /************************************************************************
32  *									*
33  *		Getting/Setting encoding meta tags			*
34  *									*
35  ************************************************************************/
36 
37 /**
38  * htmlGetMetaEncoding:
39  * @doc:  the document
40  *
41  * Encoding definition lookup in the Meta tags
42  *
43  * Returns the current encoding as flagged in the HTML source
44  */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47     htmlNodePtr cur;
48     const xmlChar *content;
49     const xmlChar *encoding;
50 
51     if (doc == NULL)
52 	return(NULL);
53     cur = doc->children;
54 
55     /*
56      * Search the html
57      */
58     while (cur != NULL) {
59 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 		break;
62 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 		goto found_head;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 		goto found_meta;
66 	}
67 	cur = cur->next;
68     }
69     if (cur == NULL)
70 	return(NULL);
71     cur = cur->children;
72 
73     /*
74      * Search the head
75      */
76     while (cur != NULL) {
77 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 		break;
80 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 		goto found_meta;
82 	}
83 	cur = cur->next;
84     }
85     if (cur == NULL)
86 	return(NULL);
87 found_head:
88     cur = cur->children;
89 
90     /*
91      * Search the meta elements
92      */
93 found_meta:
94     while (cur != NULL) {
95 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 		xmlAttrPtr attr = cur->properties;
98 		int http;
99 		const xmlChar *value;
100 
101 		content = NULL;
102 		http = 0;
103 		while (attr != NULL) {
104 		    if ((attr->children != NULL) &&
105 		        (attr->children->type == XML_TEXT_NODE) &&
106 		        (attr->children->next == NULL)) {
107 			value = attr->children->content;
108 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 			    http = 1;
111 			else if ((value != NULL)
112 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 			    content = value;
114 			if ((http != 0) && (content != NULL))
115 			    goto found_content;
116 		    }
117 		    attr = attr->next;
118 		}
119 	    }
120 	}
121 	cur = cur->next;
122     }
123     return(NULL);
124 
125 found_content:
126     encoding = xmlStrstr(content, BAD_CAST"charset=");
127     if (encoding == NULL)
128 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131     if (encoding != NULL) {
132 	encoding += 8;
133     } else {
134 	encoding = xmlStrstr(content, BAD_CAST"charset =");
135 	if (encoding == NULL)
136 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 	if (encoding != NULL)
140 	    encoding += 9;
141     }
142     if (encoding != NULL) {
143 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144     }
145     return(encoding);
146 }
147 
148 /**
149  * htmlSetMetaEncoding:
150  * @doc:  the document
151  * @encoding:  the encoding string
152  *
153  * Sets the current encoding in the Meta tags
154  * NOTE: this will not change the document content encoding, just
155  * the META flag associated.
156  *
157  * Returns 0 in case of success and -1 in case of error
158  */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161     htmlNodePtr cur, meta = NULL, head = NULL;
162     const xmlChar *content = NULL;
163     char newcontent[100];
164 
165     newcontent[0] = 0;
166 
167     if (doc == NULL)
168 	return(-1);
169 
170     /* html isn't a real encoding it's just libxml2 way to get entities */
171     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172         return(-1);
173 
174     if (encoding != NULL) {
175 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176                 (char *)encoding);
177 	newcontent[sizeof(newcontent) - 1] = 0;
178     }
179 
180     cur = doc->children;
181 
182     /*
183      * Search the html
184      */
185     while (cur != NULL) {
186 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 		break;
189 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 		goto found_head;
191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 		goto found_meta;
193 	}
194 	cur = cur->next;
195     }
196     if (cur == NULL)
197 	return(-1);
198     cur = cur->children;
199 
200     /*
201      * Search the head
202      */
203     while (cur != NULL) {
204 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 		break;
207 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208                 head = cur->parent;
209 		goto found_meta;
210             }
211 	}
212 	cur = cur->next;
213     }
214     if (cur == NULL)
215 	return(-1);
216 found_head:
217     head = cur;
218     if (cur->children == NULL)
219         goto create;
220     cur = cur->children;
221 
222 found_meta:
223     /*
224      * Search and update all the remaining the meta elements carrying
225      * encoding information
226      */
227     while (cur != NULL) {
228 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 		xmlAttrPtr attr = cur->properties;
231 		int http;
232 		const xmlChar *value;
233 
234 		content = NULL;
235 		http = 0;
236 		while (attr != NULL) {
237 		    if ((attr->children != NULL) &&
238 		        (attr->children->type == XML_TEXT_NODE) &&
239 		        (attr->children->next == NULL)) {
240 			value = attr->children->content;
241 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 			    http = 1;
244 			else
245                         {
246                            if ((value != NULL) &&
247                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 			       content = value;
249                         }
250 		        if ((http != 0) && (content != NULL))
251 			    break;
252 		    }
253 		    attr = attr->next;
254 		}
255 		if ((http != 0) && (content != NULL)) {
256 		    meta = cur;
257 		    break;
258 		}
259 
260 	    }
261 	}
262 	cur = cur->next;
263     }
264 create:
265     if (meta == NULL) {
266         if ((encoding != NULL) && (head != NULL)) {
267             /*
268              * Create a new Meta element with the right attributes
269              */
270 
271             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272             if (head->children == NULL)
273                 xmlAddChild(head, meta);
274             else
275                 xmlAddPrevSibling(head->children, meta);
276             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278         }
279     } else {
280         /* remove the meta tag if NULL is passed */
281         if (encoding == NULL) {
282             xmlUnlinkNode(meta);
283             xmlFreeNode(meta);
284         }
285         /* change the document only if there is a real encoding change */
286         else if (xmlStrcasestr(content, encoding) == NULL) {
287             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288         }
289     }
290 
291 
292     return(0);
293 }
294 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are output in
 * minimized form, i.e. <option selected="selected"> is written as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method".
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL    /* end of list marker */
};
308 
309 
310 /**
311  * htmlIsBooleanAttr:
312  * @name:  the name of the attribute to check
313  *
314  * Determine if a given attribute is a boolean attribute.
315  *
316  * returns: false if the attribute is not boolean, true otherwise.
317  */
318 int
htmlIsBooleanAttr(const xmlChar * name)319 htmlIsBooleanAttr(const xmlChar *name)
320 {
321     int i = 0;
322 
323     while (htmlBooleanAttrs[i] != NULL) {
324         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325             return 1;
326         i++;
327     }
328     return 0;
329 }
330 
331 #ifdef LIBXML_OUTPUT_ENABLED
332 /************************************************************************
333  *									*
334  *			Output error handlers				*
335  *									*
336  ************************************************************************/
337 /**
338  * htmlSaveErrMemory:
339  * @extra:  extra information
340  *
341  * Handle an out of memory condition
342  */
343 static void
htmlSaveErrMemory(const char * extra)344 htmlSaveErrMemory(const char *extra)
345 {
346     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347 }
348 
349 /**
350  * htmlSaveErr:
351  * @code:  the error number
352  * @node:  the location of the error.
353  * @extra:  extra information
354  *
355  * Handle an out of memory condition
356  */
357 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)358 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359 {
360     const char *msg = NULL;
361 
362     switch(code) {
363         case XML_SAVE_NOT_UTF8:
364 	    msg = "string is not in UTF-8\n";
365 	    break;
366 	case XML_SAVE_CHAR_INVALID:
367 	    msg = "invalid character value\n";
368 	    break;
369 	case XML_SAVE_UNKNOWN_ENCODING:
370 	    msg = "unknown encoding %s\n";
371 	    break;
372 	case XML_SAVE_NO_DOCTYPE:
373 	    msg = "HTML has no DOCTYPE\n";
374 	    break;
375 	default:
376 	    msg = "unexpected error number\n";
377     }
378     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379 }
380 
381 /************************************************************************
382  *									*
383  *		Dumping HTML tree content to a simple buffer		*
384  *									*
385  ************************************************************************/
386 
387 /**
388  * htmlBufNodeDumpFormat:
389  * @buf:  the xmlBufPtr output
390  * @doc:  the document
391  * @cur:  the current node
392  * @format:  should formatting spaces been added
393  *
394  * Dump an HTML node, recursive behaviour,children are printed too.
395  *
396  * Returns the number of byte written or -1 in case of error
397  */
398 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)399 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400 	           int format) {
401     size_t use;
402     int ret;
403     xmlOutputBufferPtr outbuf;
404 
405     if (cur == NULL) {
406 	return (-1);
407     }
408     if (buf == NULL) {
409 	return (-1);
410     }
411     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412     if (outbuf == NULL) {
413         htmlSaveErrMemory("allocating HTML output buffer");
414 	return (-1);
415     }
416     memset(outbuf, 0, sizeof(xmlOutputBuffer));
417     outbuf->buffer = buf;
418     outbuf->encoder = NULL;
419     outbuf->writecallback = NULL;
420     outbuf->closecallback = NULL;
421     outbuf->context = NULL;
422     outbuf->written = 0;
423 
424     use = xmlBufUse(buf);
425     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426     xmlFree(outbuf);
427     ret = xmlBufUse(buf) - use;
428     return (ret);
429 }
430 
431 /**
432  * htmlNodeDump:
433  * @buf:  the HTML buffer output
434  * @doc:  the document
435  * @cur:  the current node
436  *
437  * Dump an HTML node, recursive behaviour,children are printed too,
438  * and formatting returns are added.
439  *
440  * Returns the number of byte written or -1 in case of error
441  */
442 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)443 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444     xmlBufPtr buffer;
445     size_t ret;
446 
447     if ((buf == NULL) || (cur == NULL))
448         return(-1);
449 
450     xmlInitParser();
451     buffer = xmlBufFromBuffer(buf);
452     if (buffer == NULL)
453         return(-1);
454 
455     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456 
457     xmlBufBackToBuffer(buffer);
458 
459     if (ret > INT_MAX)
460         return(-1);
461     return((int) ret);
462 }
463 
464 /**
465  * htmlNodeDumpFileFormat:
466  * @out:  the FILE pointer
467  * @doc:  the document
468  * @cur:  the current node
469  * @encoding: the document encoding
470  * @format:  should formatting spaces been added
471  *
472  * Dump an HTML node, recursive behaviour,children are printed too.
473  *
474  * TODO: if encoding == NULL try to save in the doc encoding
475  *
476  * returns: the number of byte written or -1 in case of failure.
477  */
478 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)479 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480 	               xmlNodePtr cur, const char *encoding, int format) {
481     xmlOutputBufferPtr buf;
482     xmlCharEncodingHandlerPtr handler = NULL;
483     int ret;
484 
485     xmlInitParser();
486 
487     if (encoding != NULL) {
488 	xmlCharEncoding enc;
489 
490 	enc = xmlParseCharEncoding(encoding);
491 	if (enc != XML_CHAR_ENCODING_UTF8) {
492 	    handler = xmlFindCharEncodingHandler(encoding);
493 	    if (handler == NULL)
494 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495 	}
496     } else {
497         /*
498          * Fallback to HTML or ASCII when the encoding is unspecified
499          */
500         if (handler == NULL)
501             handler = xmlFindCharEncodingHandler("HTML");
502         if (handler == NULL)
503             handler = xmlFindCharEncodingHandler("ascii");
504     }
505 
506     /*
507      * save the content to a temp buffer.
508      */
509     buf = xmlOutputBufferCreateFile(out, handler);
510     if (buf == NULL) return(0);
511 
512     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513 
514     ret = xmlOutputBufferClose(buf);
515     return(ret);
516 }
517 
518 /**
519  * htmlNodeDumpFile:
520  * @out:  the FILE pointer
521  * @doc:  the document
522  * @cur:  the current node
523  *
524  * Dump an HTML node, recursive behaviour,children are printed too,
525  * and formatting returns are added.
526  */
527 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)528 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530 }
531 
532 /**
533  * htmlDocDumpMemoryFormat:
534  * @cur:  the document
535  * @mem:  OUT: the memory pointer
536  * @size:  OUT: the memory length
537  * @format:  should formatting spaces been added
538  *
539  * Dump an HTML document in memory and return the xmlChar * and it's size.
540  * It's up to the caller to free the memory.
541  */
542 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)543 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
544     xmlOutputBufferPtr buf;
545     xmlCharEncodingHandlerPtr handler = NULL;
546     const char *encoding;
547 
548     xmlInitParser();
549 
550     if ((mem == NULL) || (size == NULL))
551         return;
552     if (cur == NULL) {
553 	*mem = NULL;
554 	*size = 0;
555 	return;
556     }
557 
558     encoding = (const char *) htmlGetMetaEncoding(cur);
559 
560     if (encoding != NULL) {
561 	xmlCharEncoding enc;
562 
563 	enc = xmlParseCharEncoding(encoding);
564 	if (enc != XML_CHAR_ENCODING_UTF8) {
565 	    handler = xmlFindCharEncodingHandler(encoding);
566 	    if (handler == NULL)
567                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568 
569 	}
570     } else {
571         /*
572          * Fallback to HTML or ASCII when the encoding is unspecified
573          */
574         if (handler == NULL)
575             handler = xmlFindCharEncodingHandler("HTML");
576         if (handler == NULL)
577             handler = xmlFindCharEncodingHandler("ascii");
578     }
579 
580     buf = xmlAllocOutputBufferInternal(handler);
581     if (buf == NULL) {
582 	*mem = NULL;
583 	*size = 0;
584 	return;
585     }
586 
587     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588 
589     xmlOutputBufferFlush(buf);
590     if (buf->conv != NULL) {
591 	*size = xmlBufUse(buf->conv);
592 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
593     } else {
594 	*size = xmlBufUse(buf->buffer);
595 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
596     }
597     (void)xmlOutputBufferClose(buf);
598 }
599 
600 /**
601  * htmlDocDumpMemory:
602  * @cur:  the document
603  * @mem:  OUT: the memory pointer
604  * @size:  OUT: the memory length
605  *
606  * Dump an HTML document in memory and return the xmlChar * and it's size.
607  * It's up to the caller to free the memory.
608  */
609 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)610 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
611 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
612 }
613 
614 
615 /************************************************************************
616  *									*
617  *		Dumping HTML tree content to an I/O output buffer	*
618  *									*
619  ************************************************************************/
620 
621 /**
622  * htmlDtdDumpOutput:
623  * @buf:  the HTML buffer output
624  * @doc:  the document
625  * @encoding:  the encoding string
626  *
627  * TODO: check whether encoding is needed
628  *
629  * Dump the HTML document DTD, if any.
630  */
631 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)632 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633 	          const char *encoding ATTRIBUTE_UNUSED) {
634     xmlDtdPtr cur = doc->intSubset;
635 
636     if (cur == NULL) {
637 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638 	return;
639     }
640     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641     xmlOutputBufferWriteString(buf, (const char *)cur->name);
642     if (cur->ExternalID != NULL) {
643 	xmlOutputBufferWriteString(buf, " PUBLIC ");
644 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645 	if (cur->SystemID != NULL) {
646 	    xmlOutputBufferWriteString(buf, " ");
647 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648 	}
649     } else if (cur->SystemID != NULL &&
650 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651 	xmlOutputBufferWriteString(buf, " SYSTEM ");
652 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653     }
654     xmlOutputBufferWriteString(buf, ">\n");
655 }
656 
657 /**
658  * htmlAttrDumpOutput:
659  * @buf:  the HTML buffer output
660  * @doc:  the document
661  * @cur:  the attribute pointer
662  *
663  * Dump an HTML attribute
664  */
665 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)666 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667     xmlChar *value;
668 
669     /*
670      * The html output method should not escape a & character
671      * occurring in an attribute value immediately followed by
672      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673      * This is implemented in xmlEncodeEntitiesReentrant
674      */
675 
676     if (cur == NULL) {
677 	return;
678     }
679     xmlOutputBufferWriteString(buf, " ");
680     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682 	xmlOutputBufferWriteString(buf, ":");
683     }
684     xmlOutputBufferWriteString(buf, (const char *)cur->name);
685     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686 	value = xmlNodeListGetString(doc, cur->children, 0);
687 	if (value) {
688 	    xmlOutputBufferWriteString(buf, "=");
689 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
690 		(cur->parent->ns == NULL) &&
691 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
692 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
693 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
694 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696 		xmlChar *escaped;
697 		xmlChar *tmp = value;
698 
699 		while (IS_BLANK_CH(*tmp)) tmp++;
700 
701 		/*
702                  * Angle brackets are technically illegal in URIs, but they're
703                  * used in server side includes, for example. Curly brackets
704                  * are illegal as well and often used in templates.
705                  * Don't escape non-whitespace, printable ASCII chars for
706                  * improved interoperability. Only escape space, control
707                  * and non-ASCII chars.
708 		 */
709 		escaped = xmlURIEscapeStr(tmp,
710                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
711 		if (escaped != NULL) {
712 		    xmlBufWriteQuotedString(buf->buffer, escaped);
713 		    xmlFree(escaped);
714 		} else {
715 		    xmlBufWriteQuotedString(buf->buffer, value);
716 		}
717 	    } else {
718 		xmlBufWriteQuotedString(buf->buffer, value);
719 	    }
720 	    xmlFree(value);
721 	} else  {
722 	    xmlOutputBufferWriteString(buf, "=\"\"");
723 	}
724     }
725 }
726 
727 /**
728  * htmlNodeDumpFormatOutput:
729  * @buf:  the HTML buffer output
730  * @doc:  the document
731  * @cur:  the current node
732  * @encoding:  the encoding string (unused)
733  * @format:  should formatting spaces been added
734  *
735  * Dump an HTML node, recursive behaviour,children are printed too.
736  */
737 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)738 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739 	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740                          int format) {
741     xmlNodePtr root, parent;
742     xmlAttrPtr attr;
743     const htmlElemDesc * info;
744 
745     xmlInitParser();
746 
747     if ((cur == NULL) || (buf == NULL)) {
748 	return;
749     }
750 
751     root = cur;
752     parent = cur->parent;
753     while (1) {
754         switch (cur->type) {
755         case XML_HTML_DOCUMENT_NODE:
756         case XML_DOCUMENT_NODE:
757             if (((xmlDocPtr) cur)->intSubset != NULL) {
758                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759             }
760             if (cur->children != NULL) {
761                 /* Always validate cur->parent when descending. */
762                 if (cur->parent == parent) {
763                     parent = cur;
764                     cur = cur->children;
765                     continue;
766                 }
767             } else {
768                 xmlOutputBufferWriteString(buf, "\n");
769             }
770             break;
771 
772         case XML_ELEMENT_NODE:
773             /*
774              * Some users like lxml are known to pass nodes with a corrupted
775              * tree structure. Fall back to a recursive call to handle this
776              * case.
777              */
778             if ((cur->parent != parent) && (cur->children != NULL)) {
779                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780                 break;
781             }
782 
783             /*
784              * Get specific HTML info for that node.
785              */
786             if (cur->ns == NULL)
787                 info = htmlTagLookup(cur->name);
788             else
789                 info = NULL;
790 
791             xmlOutputBufferWriteString(buf, "<");
792             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794                 xmlOutputBufferWriteString(buf, ":");
795             }
796             xmlOutputBufferWriteString(buf, (const char *)cur->name);
797             if (cur->nsDef)
798                 xmlNsListDumpOutput(buf, cur->nsDef);
799             attr = cur->properties;
800             while (attr != NULL) {
801                 htmlAttrDumpOutput(buf, doc, attr);
802                 attr = attr->next;
803             }
804 
805             if ((info != NULL) && (info->empty)) {
806                 xmlOutputBufferWriteString(buf, ">");
807             } else if (cur->children == NULL) {
808                 if ((info != NULL) && (info->saveEndTag != 0) &&
809                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811                     xmlOutputBufferWriteString(buf, ">");
812                 } else {
813                     xmlOutputBufferWriteString(buf, "></");
814                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815                         xmlOutputBufferWriteString(buf,
816                                 (const char *)cur->ns->prefix);
817                         xmlOutputBufferWriteString(buf, ":");
818                     }
819                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
820                     xmlOutputBufferWriteString(buf, ">");
821                 }
822             } else {
823                 xmlOutputBufferWriteString(buf, ">");
824                 if ((format) && (info != NULL) && (!info->isinline) &&
825                     (cur->children->type != HTML_TEXT_NODE) &&
826                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
827                     (cur->children != cur->last) &&
828                     (cur->name != NULL) &&
829                     (cur->name[0] != 'p')) /* p, pre, param */
830                     xmlOutputBufferWriteString(buf, "\n");
831                 parent = cur;
832                 cur = cur->children;
833                 continue;
834             }
835 
836             if ((format) && (cur->next != NULL) &&
837                 (info != NULL) && (!info->isinline)) {
838                 if ((cur->next->type != HTML_TEXT_NODE) &&
839                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
840                     (parent != NULL) &&
841                     (parent->name != NULL) &&
842                     (parent->name[0] != 'p')) /* p, pre, param */
843                     xmlOutputBufferWriteString(buf, "\n");
844             }
845 
846             break;
847 
848         case XML_ATTRIBUTE_NODE:
849             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850             break;
851 
852         case HTML_TEXT_NODE:
853             if (cur->content == NULL)
854                 break;
855             if (((cur->name == (const xmlChar *)xmlStringText) ||
856                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857                 ((parent == NULL) ||
858                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860                 xmlChar *buffer;
861 
862                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863                 if (buffer != NULL) {
864                     xmlOutputBufferWriteString(buf, (const char *)buffer);
865                     xmlFree(buffer);
866                 }
867             } else {
868                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
869             }
870             break;
871 
872         case HTML_COMMENT_NODE:
873             if (cur->content != NULL) {
874                 xmlOutputBufferWriteString(buf, "<!--");
875                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
876                 xmlOutputBufferWriteString(buf, "-->");
877             }
878             break;
879 
880         case HTML_PI_NODE:
881             if (cur->name != NULL) {
882                 xmlOutputBufferWriteString(buf, "<?");
883                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
884                 if (cur->content != NULL) {
885                     xmlOutputBufferWriteString(buf, " ");
886                     xmlOutputBufferWriteString(buf,
887                             (const char *)cur->content);
888                 }
889                 xmlOutputBufferWriteString(buf, ">");
890             }
891             break;
892 
893         case HTML_ENTITY_REF_NODE:
894             xmlOutputBufferWriteString(buf, "&");
895             xmlOutputBufferWriteString(buf, (const char *)cur->name);
896             xmlOutputBufferWriteString(buf, ";");
897             break;
898 
899         case HTML_PRESERVE_NODE:
900             if (cur->content != NULL) {
901                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
902             }
903             break;
904 
905         default:
906             break;
907         }
908 
909         while (1) {
910             if (cur == root)
911                 return;
912             if (cur->next != NULL) {
913                 cur = cur->next;
914                 break;
915             }
916 
917             cur = parent;
918             /* cur->parent was validated when descending. */
919             parent = cur->parent;
920 
921             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
922                 (cur->type == XML_DOCUMENT_NODE)) {
923                 xmlOutputBufferWriteString(buf, "\n");
924             } else {
925                 if ((format) && (cur->ns == NULL))
926                     info = htmlTagLookup(cur->name);
927                 else
928                     info = NULL;
929 
930                 if ((format) && (info != NULL) && (!info->isinline) &&
931                     (cur->last->type != HTML_TEXT_NODE) &&
932                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
933                     (cur->children != cur->last) &&
934                     (cur->name != NULL) &&
935                     (cur->name[0] != 'p')) /* p, pre, param */
936                     xmlOutputBufferWriteString(buf, "\n");
937 
938                 xmlOutputBufferWriteString(buf, "</");
939                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941                     xmlOutputBufferWriteString(buf, ":");
942                 }
943                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
944                 xmlOutputBufferWriteString(buf, ">");
945 
946                 if ((format) && (info != NULL) && (!info->isinline) &&
947                     (cur->next != NULL)) {
948                     if ((cur->next->type != HTML_TEXT_NODE) &&
949                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
950                         (parent != NULL) &&
951                         (parent->name != NULL) &&
952                         (parent->name[0] != 'p')) /* p, pre, param */
953                         xmlOutputBufferWriteString(buf, "\n");
954                 }
955             }
956         }
957     }
958 }
959 
960 /**
961  * htmlNodeDumpOutput:
962  * @buf:  the HTML buffer output
963  * @doc:  the document
964  * @cur:  the current node
965  * @encoding:  the encoding string (unused)
966  *
967  * Dump an HTML node, recursive behaviour,children are printed too,
968  * and formatting returns/spaces are added.
969  */
970 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)971 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974 }
975 
976 /**
977  * htmlDocContentDumpFormatOutput:
978  * @buf:  the HTML buffer output
979  * @cur:  the document
980  * @encoding:  the encoding string (unused)
981  * @format:  should formatting spaces been added
982  *
983  * Dump an HTML document.
984  */
985 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)986 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987 	                       const char *encoding ATTRIBUTE_UNUSED,
988                                int format) {
989     int type = 0;
990     if (cur) {
991         type = cur->type;
992         cur->type = XML_HTML_DOCUMENT_NODE;
993     }
994     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995     if (cur)
996         cur->type = (xmlElementType) type;
997 }
998 
999 /**
1000  * htmlDocContentDumpOutput:
1001  * @buf:  the HTML buffer output
1002  * @cur:  the document
1003  * @encoding:  the encoding string (unused)
1004  *
1005  * Dump an HTML document. Formatting return/spaces are added.
1006  */
1007 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)1008 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009 	                 const char *encoding ATTRIBUTE_UNUSED) {
1010     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011 }
1012 
1013 /************************************************************************
1014  *									*
1015  *		Saving functions front-ends				*
1016  *									*
1017  ************************************************************************/
1018 
1019 /**
1020  * htmlDocDump:
1021  * @f:  the FILE*
1022  * @cur:  the document
1023  *
1024  * Dump an HTML document to an open FILE.
1025  *
1026  * returns: the number of byte written or -1 in case of failure.
1027  */
1028 int
htmlDocDump(FILE * f,xmlDocPtr cur)1029 htmlDocDump(FILE *f, xmlDocPtr cur) {
1030     xmlOutputBufferPtr buf;
1031     xmlCharEncodingHandlerPtr handler = NULL;
1032     const char *encoding;
1033     int ret;
1034 
1035     xmlInitParser();
1036 
1037     if ((cur == NULL) || (f == NULL)) {
1038 	return(-1);
1039     }
1040 
1041     encoding = (const char *) htmlGetMetaEncoding(cur);
1042 
1043     if (encoding != NULL) {
1044 	xmlCharEncoding enc;
1045 
1046 	enc = xmlParseCharEncoding(encoding);
1047 	if (enc != XML_CHAR_ENCODING_UTF8) {
1048 	    handler = xmlFindCharEncodingHandler(encoding);
1049 	    if (handler == NULL)
1050 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051 	}
1052     } else {
1053         /*
1054          * Fallback to HTML or ASCII when the encoding is unspecified
1055          */
1056         if (handler == NULL)
1057             handler = xmlFindCharEncodingHandler("HTML");
1058         if (handler == NULL)
1059             handler = xmlFindCharEncodingHandler("ascii");
1060     }
1061 
1062     buf = xmlOutputBufferCreateFile(f, handler);
1063     if (buf == NULL) return(-1);
1064     htmlDocContentDumpOutput(buf, cur, NULL);
1065 
1066     ret = xmlOutputBufferClose(buf);
1067     return(ret);
1068 }
1069 
1070 /**
1071  * htmlSaveFile:
1072  * @filename:  the filename (or URL)
1073  * @cur:  the document
1074  *
1075  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1076  * used.
1077  * returns: the number of byte written or -1 in case of failure.
1078  */
1079 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1080 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081     xmlOutputBufferPtr buf;
1082     xmlCharEncodingHandlerPtr handler = NULL;
1083     const char *encoding;
1084     int ret;
1085 
1086     if ((cur == NULL) || (filename == NULL))
1087         return(-1);
1088 
1089     xmlInitParser();
1090 
1091     encoding = (const char *) htmlGetMetaEncoding(cur);
1092 
1093     if (encoding != NULL) {
1094 	xmlCharEncoding enc;
1095 
1096 	enc = xmlParseCharEncoding(encoding);
1097 	if (enc != XML_CHAR_ENCODING_UTF8) {
1098 	    handler = xmlFindCharEncodingHandler(encoding);
1099 	    if (handler == NULL)
1100 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101 	}
1102     } else {
1103         /*
1104          * Fallback to HTML or ASCII when the encoding is unspecified
1105          */
1106         if (handler == NULL)
1107             handler = xmlFindCharEncodingHandler("HTML");
1108         if (handler == NULL)
1109             handler = xmlFindCharEncodingHandler("ascii");
1110     }
1111 
1112     /*
1113      * save the content to a temp buffer.
1114      */
1115     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116     if (buf == NULL) return(0);
1117 
1118     htmlDocContentDumpOutput(buf, cur, NULL);
1119 
1120     ret = xmlOutputBufferClose(buf);
1121     return(ret);
1122 }
1123 
1124 /**
1125  * htmlSaveFileFormat:
1126  * @filename:  the filename
1127  * @cur:  the document
1128  * @format:  should formatting spaces been added
1129  * @encoding: the document encoding
1130  *
1131  * Dump an HTML document to a file using a given encoding.
1132  *
1133  * returns: the number of byte written or -1 in case of failure.
1134  */
1135 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1136 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137 	           const char *encoding, int format) {
1138     xmlOutputBufferPtr buf;
1139     xmlCharEncodingHandlerPtr handler = NULL;
1140     int ret;
1141 
1142     if ((cur == NULL) || (filename == NULL))
1143         return(-1);
1144 
1145     xmlInitParser();
1146 
1147     if (encoding != NULL) {
1148 	xmlCharEncoding enc;
1149 
1150 	enc = xmlParseCharEncoding(encoding);
1151 	if (enc != XML_CHAR_ENCODING_UTF8) {
1152 	    handler = xmlFindCharEncodingHandler(encoding);
1153 	    if (handler == NULL)
1154 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155 	}
1156         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157     } else {
1158 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159 
1160         /*
1161          * Fallback to HTML or ASCII when the encoding is unspecified
1162          */
1163         if (handler == NULL)
1164             handler = xmlFindCharEncodingHandler("HTML");
1165         if (handler == NULL)
1166             handler = xmlFindCharEncodingHandler("ascii");
1167     }
1168 
1169     /*
1170      * save the content to a temp buffer.
1171      */
1172     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173     if (buf == NULL) return(0);
1174 
1175     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176 
1177     ret = xmlOutputBufferClose(buf);
1178     return(ret);
1179 }
1180 
1181 /**
1182  * htmlSaveFileEnc:
1183  * @filename:  the filename
1184  * @cur:  the document
1185  * @encoding: the document encoding
1186  *
1187  * Dump an HTML document to a file using a given encoding
1188  * and formatting returns/spaces are added.
1189  *
1190  * returns: the number of byte written or -1 in case of failure.
1191  */
1192 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1193 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195 }
1196 
1197 #endif /* LIBXML_OUTPUT_ENABLED */
1198 
1199 #endif /* LIBXML_HTML_ENABLED */
1200