• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25 
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30 
31 /************************************************************************
32  *									*
33  *		Getting/Setting encoding meta tags			*
34  *									*
35  ************************************************************************/
36 
37 /**
38  * htmlGetMetaEncoding:
39  * @doc:  the document
40  *
41  * Encoding definition lookup in the Meta tags
42  *
43  * Returns the current encoding as flagged in the HTML source
44  */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47     htmlNodePtr cur;
48     const xmlChar *content;
49     const xmlChar *encoding;
50 
51     if (doc == NULL)
52 	return(NULL);
53     cur = doc->children;
54 
55     /*
56      * Search the html
57      */
58     while (cur != NULL) {
59 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 		break;
62 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 		goto found_head;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 		goto found_meta;
66 	}
67 	cur = cur->next;
68     }
69     if (cur == NULL)
70 	return(NULL);
71     cur = cur->children;
72 
73     /*
74      * Search the head
75      */
76     while (cur != NULL) {
77 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 		break;
80 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 		goto found_meta;
82 	}
83 	cur = cur->next;
84     }
85     if (cur == NULL)
86 	return(NULL);
87 found_head:
88     cur = cur->children;
89 
90     /*
91      * Search the meta elements
92      */
93 found_meta:
94     while (cur != NULL) {
95 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 		xmlAttrPtr attr = cur->properties;
98 		int http;
99 		const xmlChar *value;
100 
101 		content = NULL;
102 		http = 0;
103 		while (attr != NULL) {
104 		    if ((attr->children != NULL) &&
105 		        (attr->children->type == XML_TEXT_NODE) &&
106 		        (attr->children->next == NULL)) {
107 			value = attr->children->content;
108 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 			    http = 1;
111 			else if ((value != NULL)
112 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 			    content = value;
114 			if ((http != 0) && (content != NULL))
115 			    goto found_content;
116 		    }
117 		    attr = attr->next;
118 		}
119 	    }
120 	}
121 	cur = cur->next;
122     }
123     return(NULL);
124 
125 found_content:
126     encoding = xmlStrstr(content, BAD_CAST"charset=");
127     if (encoding == NULL)
128 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131     if (encoding != NULL) {
132 	encoding += 8;
133     } else {
134 	encoding = xmlStrstr(content, BAD_CAST"charset =");
135 	if (encoding == NULL)
136 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 	if (encoding != NULL)
140 	    encoding += 9;
141     }
142     if (encoding != NULL) {
143 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144     }
145     return(encoding);
146 }
147 
148 /**
149  * htmlSetMetaEncoding:
150  * @doc:  the document
151  * @encoding:  the encoding string
152  *
153  * Sets the current encoding in the Meta tags
154  * NOTE: this will not change the document content encoding, just
155  * the META flag associated.
156  *
157  * Returns 0 in case of success and -1 in case of error
158  */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161     htmlNodePtr cur, meta = NULL, head = NULL;
162     const xmlChar *content = NULL;
163     char newcontent[100];
164 
165     newcontent[0] = 0;
166 
167     if (doc == NULL)
168 	return(-1);
169 
170     /* html isn't a real encoding it's just libxml2 way to get entities */
171     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172         return(-1);
173 
174     if (encoding != NULL) {
175 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176                 (char *)encoding);
177 	newcontent[sizeof(newcontent) - 1] = 0;
178     }
179 
180     cur = doc->children;
181 
182     /*
183      * Search the html
184      */
185     while (cur != NULL) {
186 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 		break;
189 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 		goto found_head;
191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 		goto found_meta;
193 	}
194 	cur = cur->next;
195     }
196     if (cur == NULL)
197 	return(-1);
198     cur = cur->children;
199 
200     /*
201      * Search the head
202      */
203     while (cur != NULL) {
204 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 		break;
207 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208                 head = cur->parent;
209 		goto found_meta;
210             }
211 	}
212 	cur = cur->next;
213     }
214     if (cur == NULL)
215 	return(-1);
216 found_head:
217     head = cur;
218     if (cur->children == NULL)
219         goto create;
220     cur = cur->children;
221 
222 found_meta:
223     /*
224      * Search and update all the remaining the meta elements carrying
225      * encoding information
226      */
227     while (cur != NULL) {
228 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 		xmlAttrPtr attr = cur->properties;
231 		int http;
232 		const xmlChar *value;
233 
234 		content = NULL;
235 		http = 0;
236 		while (attr != NULL) {
237 		    if ((attr->children != NULL) &&
238 		        (attr->children->type == XML_TEXT_NODE) &&
239 		        (attr->children->next == NULL)) {
240 			value = attr->children->content;
241 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 			    http = 1;
244 			else
245                         {
246                            if ((value != NULL) &&
247                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 			       content = value;
249                         }
250 		        if ((http != 0) && (content != NULL))
251 			    break;
252 		    }
253 		    attr = attr->next;
254 		}
255 		if ((http != 0) && (content != NULL)) {
256 		    meta = cur;
257 		    break;
258 		}
259 
260 	    }
261 	}
262 	cur = cur->next;
263     }
264 create:
265     if (meta == NULL) {
266         if ((encoding != NULL) && (head != NULL)) {
267             /*
268              * Create a new Meta element with the right attributes
269              */
270 
271             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272             if (head->children == NULL)
273                 xmlAddChild(head, meta);
274             else
275                 xmlAddPrevSibling(head->children, meta);
276             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278         }
279     } else {
280         /* remove the meta tag if NULL is passed */
281         if (encoding == NULL) {
282             xmlUnlinkNode(meta);
283             xmlFreeNode(meta);
284         }
285         /* change the document only if there is a real encoding change */
286         else if (xmlStrcasestr(content, encoding) == NULL) {
287             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288         }
289     }
290 
291 
292     return(0);
293 }
294 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL    /* end of list marker */
};
308 
309 
310 /**
311  * htmlIsBooleanAttr:
312  * @name:  the name of the attribute to check
313  *
314  * Determine if a given attribute is a boolean attribute.
315  *
316  * returns: false if the attribute is not boolean, true otherwise.
317  */
318 int
htmlIsBooleanAttr(const xmlChar * name)319 htmlIsBooleanAttr(const xmlChar *name)
320 {
321     int i = 0;
322 
323     while (htmlBooleanAttrs[i] != NULL) {
324         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325             return 1;
326         i++;
327     }
328     return 0;
329 }
330 
331 #ifdef LIBXML_OUTPUT_ENABLED
332 /************************************************************************
333  *									*
334  *			Output error handlers				*
335  *									*
336  ************************************************************************/
337 
338 /**
339  * htmlSaveErr:
340  * @code:  the error number
341  * @node:  the location of the error.
342  * @extra:  extra information
343  *
344  * Handle an out of memory condition
345  */
346 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)347 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
348 {
349     const char *msg = NULL;
350     int res;
351 
352     switch(code) {
353         case XML_SAVE_NOT_UTF8:
354 	    msg = "string is not in UTF-8\n";
355 	    break;
356 	case XML_SAVE_CHAR_INVALID:
357 	    msg = "invalid character value\n";
358 	    break;
359 	case XML_SAVE_UNKNOWN_ENCODING:
360 	    msg = "unknown encoding %s\n";
361 	    break;
362 	case XML_SAVE_NO_DOCTYPE:
363 	    msg = "HTML has no DOCTYPE\n";
364 	    break;
365 	default:
366 	    msg = "unexpected error number\n";
367     }
368 
369     res = __xmlRaiseError(NULL, NULL, NULL, NULL, node,
370                           XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
371                           extra, NULL, NULL, 0, 0,
372                           msg, extra);
373     if (res < 0)
374         xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
375 }
376 
377 /************************************************************************
378  *									*
379  *		Dumping HTML tree content to a simple buffer		*
380  *									*
381  ************************************************************************/
382 
383 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)384 htmlFindOutputEncoder(const char *encoding) {
385     xmlCharEncodingHandler *handler = NULL;
386 
387     if (encoding != NULL) {
388 	xmlCharEncoding enc;
389 
390 	enc = xmlParseCharEncoding(encoding);
391 	if (enc != XML_CHAR_ENCODING_UTF8) {
392 	    xmlOpenCharEncodingHandler(encoding, /* output */ 1, &handler);
393 	    if (handler == NULL)
394 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
395 	}
396     } else {
397         /*
398          * Fallback to HTML or ASCII when the encoding is unspecified
399          */
400         if (handler == NULL)
401             xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402         if (handler == NULL)
403             xmlOpenCharEncodingHandler("ascii", /* output */ 1, &handler);
404     }
405 
406     return(handler);
407 }
408 
409 /**
410  * htmlBufNodeDumpFormat:
411  * @buf:  the xmlBufPtr output
412  * @doc:  the document
413  * @cur:  the current node
414  * @format:  should formatting spaces been added
415  *
416  * Dump an HTML node, recursive behaviour,children are printed too.
417  *
418  * Returns the number of byte written or -1 in case of error
419  */
420 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)421 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
422 	           int format) {
423     size_t use;
424     size_t ret;
425     xmlOutputBufferPtr outbuf;
426 
427     if (cur == NULL) {
428 	return ((size_t) -1);
429     }
430     if (buf == NULL) {
431 	return ((size_t) -1);
432     }
433     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
434     if (outbuf == NULL)
435 	return ((size_t) -1);
436     memset(outbuf, 0, sizeof(xmlOutputBuffer));
437     outbuf->buffer = buf;
438     outbuf->encoder = NULL;
439     outbuf->writecallback = NULL;
440     outbuf->closecallback = NULL;
441     outbuf->context = NULL;
442     outbuf->written = 0;
443 
444     use = xmlBufUse(buf);
445     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
446     if (outbuf->error)
447         ret = (size_t) -1;
448     else
449         ret = xmlBufUse(buf) - use;
450     xmlFree(outbuf);
451     return (ret);
452 }
453 
454 /**
455  * htmlNodeDump:
456  * @buf:  the HTML buffer output
457  * @doc:  the document
458  * @cur:  the current node
459  *
460  * Dump an HTML node, recursive behaviour,children are printed too,
461  * and formatting returns are added.
462  *
463  * Returns the number of byte written or -1 in case of error
464  */
465 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)466 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
467     xmlBufPtr buffer;
468     size_t ret;
469 
470     if ((buf == NULL) || (cur == NULL))
471         return(-1);
472 
473     xmlInitParser();
474     buffer = xmlBufFromBuffer(buf);
475     if (buffer == NULL)
476         return(-1);
477 
478     xmlBufSetAllocationScheme(buffer, XML_BUFFER_ALLOC_DOUBLEIT);
479     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
480 
481     xmlBufBackToBuffer(buffer);
482 
483     if (ret > INT_MAX)
484         return(-1);
485     return((int) ret);
486 }
487 
488 /**
489  * htmlNodeDumpFileFormat:
490  * @out:  the FILE pointer
491  * @doc:  the document
492  * @cur:  the current node
493  * @encoding: the document encoding
494  * @format:  should formatting spaces been added
495  *
496  * Dump an HTML node, recursive behaviour,children are printed too.
497  *
498  * TODO: if encoding == NULL try to save in the doc encoding
499  *
500  * returns: the number of byte written or -1 in case of failure.
501  */
502 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)503 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
504 	               xmlNodePtr cur, const char *encoding, int format) {
505     xmlOutputBufferPtr buf;
506     xmlCharEncodingHandlerPtr handler;
507     int ret;
508 
509     xmlInitParser();
510 
511     /*
512      * save the content to a temp buffer.
513      */
514     handler = htmlFindOutputEncoder(encoding);
515     buf = xmlOutputBufferCreateFile(out, handler);
516     if (buf == NULL) {
517         xmlCharEncCloseFunc(handler);
518         return(0);
519     }
520 
521     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
522 
523     ret = xmlOutputBufferClose(buf);
524     return(ret);
525 }
526 
527 /**
528  * htmlNodeDumpFile:
529  * @out:  the FILE pointer
530  * @doc:  the document
531  * @cur:  the current node
532  *
533  * Dump an HTML node, recursive behaviour,children are printed too,
534  * and formatting returns are added.
535  */
536 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539 }
540 
541 /**
542  * htmlDocDumpMemoryFormat:
543  * @cur:  the document
544  * @mem:  OUT: the memory pointer
545  * @size:  OUT: the memory length
546  * @format:  should formatting spaces been added
547  *
548  * Dump an HTML document in memory and return the xmlChar * and it's size.
549  * It's up to the caller to free the memory.
550  */
551 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
553     xmlOutputBufferPtr buf;
554     xmlCharEncodingHandlerPtr handler = NULL;
555     const char *encoding;
556 
557     xmlInitParser();
558 
559     if ((mem == NULL) || (size == NULL))
560         return;
561     *mem = NULL;
562     *size = 0;
563     if (cur == NULL)
564 	return;
565 
566     encoding = (const char *) htmlGetMetaEncoding(cur);
567     handler = htmlFindOutputEncoder(encoding);
568     buf = xmlAllocOutputBufferInternal(handler);
569     if (buf == NULL) {
570         xmlCharEncCloseFunc(handler);
571 	return;
572     }
573 
574     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
575 
576     xmlOutputBufferFlush(buf);
577 
578     if (!buf->error) {
579         if (buf->conv != NULL) {
580             *size = xmlBufUse(buf->conv);
581             *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
582         } else {
583             *size = xmlBufUse(buf->buffer);
584             *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
585         }
586     }
587 
588     xmlOutputBufferClose(buf);
589 }
590 
591 /**
592  * htmlDocDumpMemory:
593  * @cur:  the document
594  * @mem:  OUT: the memory pointer
595  * @size:  OUT: the memory length
596  *
597  * Dump an HTML document in memory and return the xmlChar * and it's size.
598  * It's up to the caller to free the memory.
599  */
600 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)601 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
602 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
603 }
604 
605 
606 /************************************************************************
607  *									*
608  *		Dumping HTML tree content to an I/O output buffer	*
609  *									*
610  ************************************************************************/
611 
612 /**
613  * htmlDtdDumpOutput:
614  * @buf:  the HTML buffer output
615  * @doc:  the document
616  * @encoding:  the encoding string
617  *
618  * TODO: check whether encoding is needed
619  *
620  * Dump the HTML document DTD, if any.
621  */
622 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)623 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
624 	          const char *encoding ATTRIBUTE_UNUSED) {
625     xmlDtdPtr cur = doc->intSubset;
626 
627     if (cur == NULL) {
628 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
629 	return;
630     }
631     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
632     xmlOutputBufferWriteString(buf, (const char *)cur->name);
633     if (cur->ExternalID != NULL) {
634 	xmlOutputBufferWriteString(buf, " PUBLIC ");
635 	xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
636 	if (cur->SystemID != NULL) {
637 	    xmlOutputBufferWriteString(buf, " ");
638 	    xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
639 	}
640     } else if (cur->SystemID != NULL &&
641 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
642 	xmlOutputBufferWriteString(buf, " SYSTEM ");
643 	xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
644     }
645     xmlOutputBufferWriteString(buf, ">\n");
646 }
647 
648 /**
649  * htmlAttrDumpOutput:
650  * @buf:  the HTML buffer output
651  * @doc:  the document
652  * @cur:  the attribute pointer
653  *
654  * Dump an HTML attribute
655  */
656 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)657 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
658     xmlChar *value;
659 
660     /*
661      * The html output method should not escape a & character
662      * occurring in an attribute value immediately followed by
663      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
664      * This is implemented in xmlEncodeEntitiesReentrant
665      */
666 
667     if (cur == NULL) {
668 	return;
669     }
670     xmlOutputBufferWriteString(buf, " ");
671     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
672         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
673 	xmlOutputBufferWriteString(buf, ":");
674     }
675     xmlOutputBufferWriteString(buf, (const char *)cur->name);
676     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
677 	value = xmlNodeListGetString(doc, cur->children, 0);
678 	if (value) {
679 	    xmlOutputBufferWriteString(buf, "=");
680 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
681 		(cur->parent->ns == NULL) &&
682 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
683 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
684 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
685 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
686 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
687 		xmlChar *escaped;
688 		xmlChar *tmp = value;
689 
690 		while (IS_BLANK_CH(*tmp)) tmp++;
691 
692 		/*
693                  * Angle brackets are technically illegal in URIs, but they're
694                  * used in server side includes, for example. Curly brackets
695                  * are illegal as well and often used in templates.
696                  * Don't escape non-whitespace, printable ASCII chars for
697                  * improved interoperability. Only escape space, control
698                  * and non-ASCII chars.
699 		 */
700 		escaped = xmlURIEscapeStr(tmp,
701                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
702 		if (escaped != NULL) {
703 		    xmlOutputBufferWriteQuotedString(buf, escaped);
704 		    xmlFree(escaped);
705 		} else {
706                     buf->error = XML_ERR_NO_MEMORY;
707 		}
708 	    } else {
709 		xmlOutputBufferWriteQuotedString(buf, value);
710 	    }
711 	    xmlFree(value);
712 	} else  {
713             buf->error = XML_ERR_NO_MEMORY;
714 	}
715     }
716 }
717 
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string (not used for output; only forwarded
 *             to the recursive fallback call)
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node, children are printed too.
 *
 * The subtree rooted at @cur is walked iteratively: the outer loop emits
 * the current node and descends into its children, the inner loop moves
 * to the next sibling or climbs back to the parent while emitting end
 * tags. A recursive call is only made for element nodes whose parent
 * pointer does not match the node they were reached from.
 *
 * Nothing is done if @cur or @buf is NULL. If escaping a text node fails,
 * @buf->error is set to XML_ERR_NO_MEMORY and the dump stops.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format) {
    xmlNodePtr root, parent;
    xmlAttrPtr attr;
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
	return;
    }

    /*
     * root marks where the walk must stop; parent is the node the walk
     * expects to be the parent of cur.
     */
    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            /* Emit the DOCTYPE first, if the document has one. */
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWriteString(buf, "\n");
            }
            break;

        case XML_ELEMENT_NODE:
            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             * Namespaced elements are never looked up.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            /* Start tag: name, namespace declarations, attributes. */
            xmlOutputBufferWriteString(buf, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                htmlAttrDumpOutput(buf, doc, attr);
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /*
                 * Element flagged as empty in its description: no end tag
                 * is written and its children, if any, are not visited.
                 */
                xmlOutputBufferWriteString(buf, ">");
            } else if (cur->children == NULL) {
                /*
                 * Childless element: the end tag is left out when the
                 * description has saveEndTag set and the element is
                 * neither html nor body; otherwise "></name>" is written.
                 */
                if ((info != NULL) && (info->saveEndTag != 0) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
                    xmlOutputBufferWriteString(buf, ">");
                } else {
                    xmlOutputBufferWriteString(buf, "></");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWriteString(buf, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWriteString(buf, ">");
                }
            } else {
                /*
                 * Element with children: close the start tag, optionally
                 * add a formatting newline, then descend. The end tag is
                 * written later, when the walk climbs back to this node.
                 */
                xmlOutputBufferWriteString(buf, ">");
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->children->type != HTML_TEXT_NODE) &&
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
                parent = cur;
                cur = cur->children;
                continue;
            }

            /*
             * Formatting newline after a non-inline element that has been
             * completely written, unless text or an entity reference
             * follows or the parent's name starts with 'p'.
             */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
            }

            break;

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /*
             * Text is entity-escaped unless the node is named
             * xmlStringTextNoenc or its parent is a script or style
             * element (compared ignoring case); in those cases the
             * content is written verbatim.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((parent == NULL) ||
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer == NULL) {
                    /* Abort the whole dump on allocation failure. */
                    buf->error = XML_ERR_NO_MEMORY;
                    return;
                }
                xmlOutputBufferWriteString(buf, (const char *)buffer);
                xmlFree(buffer);
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWriteString(buf, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* Processing instructions are closed with ">" here, not "?>". */
            if (cur->name != NULL) {
                xmlOutputBufferWriteString(buf, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWriteString(buf, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWriteString(buf, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ";");
            break;

        case HTML_PRESERVE_NODE:
            /* Content is written as-is, without escaping. */
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            break;
        }

        /*
         * Advance: take the next sibling if there is one, otherwise climb
         * to the parent, close it, and try again from there. The walk
         * ends as soon as the current node is the starting node.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWriteString(buf, "\n");
            } else {
                /* The description is only needed for formatting here. */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                /* Formatting newline before the end tag. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");

                xmlOutputBufferWriteString(buf, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWriteString(buf, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWriteString(buf, ">");

                /* Formatting newline after the end tag. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWriteString(buf, "\n");
                }
            }
        }
    }
}
952 
953 /**
954  * htmlNodeDumpOutput:
955  * @buf:  the HTML buffer output
956  * @doc:  the document
957  * @cur:  the current node
958  * @encoding:  the encoding string (unused)
959  *
960  * Dump an HTML node, recursive behaviour,children are printed too,
961  * and formatting returns/spaces are added.
962  */
963 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)964 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
965 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
966     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
967 }
968 
969 /**
970  * htmlDocContentDumpFormatOutput:
971  * @buf:  the HTML buffer output
972  * @cur:  the document
973  * @encoding:  the encoding string (unused)
974  * @format:  should formatting spaces been added
975  *
976  * Dump an HTML document.
977  */
978 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)979 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
980 	                       const char *encoding ATTRIBUTE_UNUSED,
981                                int format) {
982     int type = 0;
983     if (cur) {
984         type = cur->type;
985         cur->type = XML_HTML_DOCUMENT_NODE;
986     }
987     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
988     if (cur)
989         cur->type = (xmlElementType) type;
990 }
991 
992 /**
993  * htmlDocContentDumpOutput:
994  * @buf:  the HTML buffer output
995  * @cur:  the document
996  * @encoding:  the encoding string (unused)
997  *
998  * Dump an HTML document. Formatting return/spaces are added.
999  */
1000 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)1001 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1002 	                 const char *encoding ATTRIBUTE_UNUSED) {
1003     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1004 }
1005 
1006 /************************************************************************
1007  *									*
1008  *		Saving functions front-ends				*
1009  *									*
1010  ************************************************************************/
1011 
1012 /**
1013  * htmlDocDump:
1014  * @f:  the FILE*
1015  * @cur:  the document
1016  *
1017  * Dump an HTML document to an open FILE.
1018  *
1019  * returns: the number of byte written or -1 in case of failure.
1020  */
1021 int
htmlDocDump(FILE * f,xmlDocPtr cur)1022 htmlDocDump(FILE *f, xmlDocPtr cur) {
1023     xmlOutputBufferPtr buf;
1024     xmlCharEncodingHandlerPtr handler = NULL;
1025     const char *encoding;
1026     int ret;
1027 
1028     xmlInitParser();
1029 
1030     if ((cur == NULL) || (f == NULL)) {
1031 	return(-1);
1032     }
1033 
1034     encoding = (const char *) htmlGetMetaEncoding(cur);
1035     handler = htmlFindOutputEncoder(encoding);
1036     buf = xmlOutputBufferCreateFile(f, handler);
1037     if (buf == NULL) {
1038         xmlCharEncCloseFunc(handler);
1039         return(-1);
1040     }
1041     htmlDocContentDumpOutput(buf, cur, NULL);
1042 
1043     ret = xmlOutputBufferClose(buf);
1044     return(ret);
1045 }
1046 
1047 /**
1048  * htmlSaveFile:
1049  * @filename:  the filename (or URL)
1050  * @cur:  the document
1051  *
1052  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1053  * used.
1054  * returns: the number of byte written or -1 in case of failure.
1055  */
1056 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1057 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1058     xmlOutputBufferPtr buf;
1059     xmlCharEncodingHandlerPtr handler = NULL;
1060     const char *encoding;
1061     int ret;
1062 
1063     if ((cur == NULL) || (filename == NULL))
1064         return(-1);
1065 
1066     xmlInitParser();
1067 
1068     encoding = (const char *) htmlGetMetaEncoding(cur);
1069     handler = htmlFindOutputEncoder(encoding);
1070     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1071     if (buf == NULL) {
1072         xmlCharEncCloseFunc(handler);
1073         return(0);
1074     }
1075 
1076     htmlDocContentDumpOutput(buf, cur, NULL);
1077 
1078     ret = xmlOutputBufferClose(buf);
1079     return(ret);
1080 }
1081 
1082 /**
1083  * htmlSaveFileFormat:
1084  * @filename:  the filename
1085  * @cur:  the document
1086  * @format:  should formatting spaces been added
1087  * @encoding: the document encoding
1088  *
1089  * Dump an HTML document to a file using a given encoding.
1090  *
1091  * returns: the number of byte written or -1 in case of failure.
1092  */
1093 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1094 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1095 	           const char *encoding, int format) {
1096     xmlOutputBufferPtr buf;
1097     xmlCharEncodingHandlerPtr handler = NULL;
1098     int ret;
1099 
1100     if ((cur == NULL) || (filename == NULL))
1101         return(-1);
1102 
1103     xmlInitParser();
1104 
1105     handler = htmlFindOutputEncoder(encoding);
1106     if (handler != NULL)
1107         htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1108     else
1109 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1110 
1111     /*
1112      * save the content to a temp buffer.
1113      */
1114     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1115     if (buf == NULL) {
1116         xmlCharEncCloseFunc(handler);
1117         return(0);
1118     }
1119 
1120     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1121 
1122     ret = xmlOutputBufferClose(buf);
1123     return(ret);
1124 }
1125 
1126 /**
1127  * htmlSaveFileEnc:
1128  * @filename:  the filename
1129  * @cur:  the document
1130  * @encoding: the document encoding
1131  *
1132  * Dump an HTML document to a file using a given encoding
1133  * and formatting returns/spaces are added.
1134  *
1135  * returns: the number of byte written or -1 in case of failure.
1136  */
1137 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1138 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1139     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1140 }
1141 
1142 #endif /* LIBXML_OUTPUT_ENABLED */
1143 
1144 #endif /* LIBXML_HTML_ENABLED */
1145