• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22 
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32 
33 /************************************************************************
34  *									*
35  *   		Getting/Setting encoding meta tags			*
36  *									*
37  ************************************************************************/
38 
39 /**
40  * htmlGetMetaEncoding:
41  * @doc:  the document
42  *
43  * Encoding definition lookup in the Meta tags
44  *
45  * Returns the current encoding as flagged in the HTML source
46  */
47 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)48 htmlGetMetaEncoding(htmlDocPtr doc) {
49     htmlNodePtr cur;
50     const xmlChar *content;
51     const xmlChar *encoding;
52 
53     if (doc == NULL)
54 	return(NULL);
55     cur = doc->children;
56 
57     /*
58      * Search the html
59      */
60     while (cur != NULL) {
61 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63 		break;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65 		goto found_head;
66 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67 		goto found_meta;
68 	}
69 	cur = cur->next;
70     }
71     if (cur == NULL)
72 	return(NULL);
73     cur = cur->children;
74 
75     /*
76      * Search the head
77      */
78     while (cur != NULL) {
79 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81 		break;
82 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83 		goto found_meta;
84 	}
85 	cur = cur->next;
86     }
87     if (cur == NULL)
88 	return(NULL);
89 found_head:
90     cur = cur->children;
91 
92     /*
93      * Search the meta elements
94      */
95 found_meta:
96     while (cur != NULL) {
97 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99 		xmlAttrPtr attr = cur->properties;
100 		int http;
101 		const xmlChar *value;
102 
103 		content = NULL;
104 		http = 0;
105 		while (attr != NULL) {
106 		    if ((attr->children != NULL) &&
107 		        (attr->children->type == XML_TEXT_NODE) &&
108 		        (attr->children->next == NULL)) {
109 			value = attr->children->content;
110 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112 			    http = 1;
113 			else if ((value != NULL)
114 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115 			    content = value;
116 			if ((http != 0) && (content != NULL))
117 			    goto found_content;
118 		    }
119 		    attr = attr->next;
120 		}
121 	    }
122 	}
123 	cur = cur->next;
124     }
125     return(NULL);
126 
127 found_content:
128     encoding = xmlStrstr(content, BAD_CAST"charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131     if (encoding == NULL)
132 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133     if (encoding != NULL) {
134 	encoding += 8;
135     } else {
136 	encoding = xmlStrstr(content, BAD_CAST"charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139 	if (encoding == NULL)
140 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141 	if (encoding != NULL)
142 	    encoding += 9;
143     }
144     if (encoding != NULL) {
145 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146     }
147     return(encoding);
148 }
149 
150 /**
151  * htmlSetMetaEncoding:
152  * @doc:  the document
153  * @encoding:  the encoding string
154  *
155  * Sets the current encoding in the Meta tags
156  * NOTE: this will not change the document content encoding, just
157  * the META flag associated.
158  *
159  * Returns 0 in case of success and -1 in case of error
160  */
161 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163     htmlNodePtr cur, meta = NULL, head = NULL;
164     const xmlChar *content = NULL;
165     char newcontent[100];
166 
167 
168     if (doc == NULL)
169 	return(-1);
170 
171     /* html isn't a real encoding it's just libxml2 way to get entities */
172     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
173         return(-1);
174 
175     if (encoding != NULL) {
176 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
177                 (char *)encoding);
178 	newcontent[sizeof(newcontent) - 1] = 0;
179     }
180 
181     cur = doc->children;
182 
183     /*
184      * Search the html
185      */
186     while (cur != NULL) {
187 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
188 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
189 		break;
190 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
191 		goto found_head;
192 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
193 		goto found_meta;
194 	}
195 	cur = cur->next;
196     }
197     if (cur == NULL)
198 	return(-1);
199     cur = cur->children;
200 
201     /*
202      * Search the head
203      */
204     while (cur != NULL) {
205 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
206 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
207 		break;
208 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
209                 head = cur->parent;
210 		goto found_meta;
211             }
212 	}
213 	cur = cur->next;
214     }
215     if (cur == NULL)
216 	return(-1);
217 found_head:
218     head = cur;
219     if (cur->children == NULL)
220         goto create;
221     cur = cur->children;
222 
223 found_meta:
224     /*
225      * Search and update all the remaining the meta elements carrying
226      * encoding informations
227      */
228     while (cur != NULL) {
229 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
230 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
231 		xmlAttrPtr attr = cur->properties;
232 		int http;
233 		const xmlChar *value;
234 
235 		content = NULL;
236 		http = 0;
237 		while (attr != NULL) {
238 		    if ((attr->children != NULL) &&
239 		        (attr->children->type == XML_TEXT_NODE) &&
240 		        (attr->children->next == NULL)) {
241 			value = attr->children->content;
242 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
243 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
244 			    http = 1;
245 			else
246                         {
247                            if ((value != NULL) &&
248                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
249 			       content = value;
250                         }
251 		        if ((http != 0) && (content != NULL))
252 			    break;
253 		    }
254 		    attr = attr->next;
255 		}
256 		if ((http != 0) && (content != NULL)) {
257 		    meta = cur;
258 		    break;
259 		}
260 
261 	    }
262 	}
263 	cur = cur->next;
264     }
265 create:
266     if (meta == NULL) {
267         if ((encoding != NULL) && (head != NULL)) {
268             /*
269              * Create a new Meta element with the right attributes
270              */
271 
272             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
273             if (head->children == NULL)
274                 xmlAddChild(head, meta);
275             else
276                 xmlAddPrevSibling(head->children, meta);
277             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
278             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
279         }
280     } else {
281         /* change the document only if there is a real encoding change */
282         if (xmlStrcasestr(content, encoding) == NULL) {
283             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
284         }
285     }
286 
287 
288     return(0);
289 }
290 
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The list is NULL terminated; neither the strings nor the table entries
 * are meant to be modified.
 */
static const char *const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
304 
305 
306 /**
307  * htmlIsBooleanAttr:
308  * @name:  the name of the attribute to check
309  *
310  * Determine if a given attribute is a boolean attribute.
311  *
312  * returns: false if the attribute is not boolean, true otherwise.
313  */
314 int
htmlIsBooleanAttr(const xmlChar * name)315 htmlIsBooleanAttr(const xmlChar *name)
316 {
317     int i = 0;
318 
319     while (htmlBooleanAttrs[i] != NULL) {
320         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
321             return 1;
322         i++;
323     }
324     return 0;
325 }
326 
327 #ifdef LIBXML_OUTPUT_ENABLED
328 /*
329  * private routine exported from xmlIO.c
330  */
331 xmlOutputBufferPtr
332 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
333 /************************************************************************
334  *									*
335  * 			Output error handlers				*
336  *									*
337  ************************************************************************/
338 /**
339  * htmlSaveErrMemory:
340  * @extra:  extra informations
341  *
342  * Handle an out of memory condition
343  */
344 static void
htmlSaveErrMemory(const char * extra)345 htmlSaveErrMemory(const char *extra)
346 {
347     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
348 }
349 
350 /**
351  * htmlSaveErr:
352  * @code:  the error number
353  * @node:  the location of the error.
354  * @extra:  extra informations
355  *
356  * Handle an out of memory condition
357  */
358 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)359 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
360 {
361     const char *msg = NULL;
362 
363     switch(code) {
364         case XML_SAVE_NOT_UTF8:
365 	    msg = "string is not in UTF-8\n";
366 	    break;
367 	case XML_SAVE_CHAR_INVALID:
368 	    msg = "invalid character value\n";
369 	    break;
370 	case XML_SAVE_UNKNOWN_ENCODING:
371 	    msg = "unknown encoding %s\n";
372 	    break;
373 	case XML_SAVE_NO_DOCTYPE:
374 	    msg = "HTML has no DOCTYPE\n";
375 	    break;
376 	default:
377 	    msg = "unexpected error number\n";
378     }
379     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
380 }
381 
382 /************************************************************************
383  *									*
384  *   		Dumping HTML tree content to a simple buffer		*
385  *									*
386  ************************************************************************/
387 
388 static int
389 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
390 	           int format);
391 
392 /**
393  * htmlNodeDumpFormat:
394  * @buf:  the HTML buffer output
395  * @doc:  the document
396  * @cur:  the current node
397  * @format:  should formatting spaces been added
398  *
399  * Dump an HTML node, recursive behaviour,children are printed too.
400  *
401  * Returns the number of byte written or -1 in case of error
402  */
403 static int
htmlNodeDumpFormat(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)404 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
405 	           int format) {
406     unsigned int use;
407     int ret;
408     xmlOutputBufferPtr outbuf;
409 
410     if (cur == NULL) {
411 	return (-1);
412     }
413     if (buf == NULL) {
414 	return (-1);
415     }
416     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
417     if (outbuf == NULL) {
418         htmlSaveErrMemory("allocating HTML output buffer");
419 	return (-1);
420     }
421     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
422     outbuf->buffer = buf;
423     outbuf->encoder = NULL;
424     outbuf->writecallback = NULL;
425     outbuf->closecallback = NULL;
426     outbuf->context = NULL;
427     outbuf->written = 0;
428 
429     use = buf->use;
430     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
431     xmlFree(outbuf);
432     ret = buf->use - use;
433     return (ret);
434 }
435 
436 /**
437  * htmlNodeDump:
438  * @buf:  the HTML buffer output
439  * @doc:  the document
440  * @cur:  the current node
441  *
442  * Dump an HTML node, recursive behaviour,children are printed too,
443  * and formatting returns are added.
444  *
445  * Returns the number of byte written or -1 in case of error
446  */
447 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)448 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
449     xmlInitParser();
450 
451     return(htmlNodeDumpFormat(buf, doc, cur, 1));
452 }
453 
454 /**
455  * htmlNodeDumpFileFormat:
456  * @out:  the FILE pointer
457  * @doc:  the document
458  * @cur:  the current node
459  * @encoding: the document encoding
460  * @format:  should formatting spaces been added
461  *
462  * Dump an HTML node, recursive behaviour,children are printed too.
463  *
464  * TODO: if encoding == NULL try to save in the doc encoding
465  *
466  * returns: the number of byte written or -1 in case of failure.
467  */
468 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)469 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
470 	               xmlNodePtr cur, const char *encoding, int format) {
471     xmlOutputBufferPtr buf;
472     xmlCharEncodingHandlerPtr handler = NULL;
473     int ret;
474 
475     xmlInitParser();
476 
477     if (encoding != NULL) {
478 	xmlCharEncoding enc;
479 
480 	enc = xmlParseCharEncoding(encoding);
481 	if (enc != XML_CHAR_ENCODING_UTF8) {
482 	    handler = xmlFindCharEncodingHandler(encoding);
483 	    if (handler == NULL)
484 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
485 	}
486     }
487 
488     /*
489      * Fallback to HTML or ASCII when the encoding is unspecified
490      */
491     if (handler == NULL)
492 	handler = xmlFindCharEncodingHandler("HTML");
493     if (handler == NULL)
494 	handler = xmlFindCharEncodingHandler("ascii");
495 
496     /*
497      * save the content to a temp buffer.
498      */
499     buf = xmlOutputBufferCreateFile(out, handler);
500     if (buf == NULL) return(0);
501 
502     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
503 
504     ret = xmlOutputBufferClose(buf);
505     return(ret);
506 }
507 
508 /**
509  * htmlNodeDumpFile:
510  * @out:  the FILE pointer
511  * @doc:  the document
512  * @cur:  the current node
513  *
514  * Dump an HTML node, recursive behaviour,children are printed too,
515  * and formatting returns are added.
516  */
517 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)518 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
519     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
520 }
521 
522 /**
523  * htmlDocDumpMemoryFormat:
524  * @cur:  the document
525  * @mem:  OUT: the memory pointer
526  * @size:  OUT: the memory length
527  * @format:  should formatting spaces been added
528  *
529  * Dump an HTML document in memory and return the xmlChar * and it's size.
530  * It's up to the caller to free the memory.
531  */
532 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)533 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
534     xmlOutputBufferPtr buf;
535     xmlCharEncodingHandlerPtr handler = NULL;
536     const char *encoding;
537 
538     xmlInitParser();
539 
540     if ((mem == NULL) || (size == NULL))
541         return;
542     if (cur == NULL) {
543 	*mem = NULL;
544 	*size = 0;
545 	return;
546     }
547 
548     encoding = (const char *) htmlGetMetaEncoding(cur);
549 
550     if (encoding != NULL) {
551 	xmlCharEncoding enc;
552 
553 	enc = xmlParseCharEncoding(encoding);
554 	if (enc != cur->charset) {
555 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
556 		/*
557 		 * Not supported yet
558 		 */
559 		*mem = NULL;
560 		*size = 0;
561 		return;
562 	    }
563 
564 	    handler = xmlFindCharEncodingHandler(encoding);
565 	    if (handler == NULL)
566                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
567 
568 	} else {
569 	    handler = xmlFindCharEncodingHandler(encoding);
570 	}
571     }
572 
573     /*
574      * Fallback to HTML or ASCII when the encoding is unspecified
575      */
576     if (handler == NULL)
577 	handler = xmlFindCharEncodingHandler("HTML");
578     if (handler == NULL)
579 	handler = xmlFindCharEncodingHandler("ascii");
580 
581     buf = xmlAllocOutputBufferInternal(handler);
582     if (buf == NULL) {
583 	*mem = NULL;
584 	*size = 0;
585 	return;
586     }
587 
588     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
589 
590     xmlOutputBufferFlush(buf);
591     if (buf->conv != NULL) {
592 	*size = buf->conv->use;
593 	*mem = xmlStrndup(buf->conv->content, *size);
594     } else {
595 	*size = buf->buffer->use;
596 	*mem = xmlStrndup(buf->buffer->content, *size);
597     }
598     (void)xmlOutputBufferClose(buf);
599 }
600 
601 /**
602  * htmlDocDumpMemory:
603  * @cur:  the document
604  * @mem:  OUT: the memory pointer
605  * @size:  OUT: the memory length
606  *
607  * Dump an HTML document in memory and return the xmlChar * and it's size.
608  * It's up to the caller to free the memory.
609  */
610 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)611 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
612 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
613 }
614 
615 
616 /************************************************************************
617  *									*
618  *   		Dumping HTML tree content to an I/O output buffer	*
619  *									*
620  ************************************************************************/
621 
622 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
623 
624 /**
625  * htmlDtdDumpOutput:
626  * @buf:  the HTML buffer output
627  * @doc:  the document
628  * @encoding:  the encoding string
629  *
630  * TODO: check whether encoding is needed
631  *
632  * Dump the HTML document DTD, if any.
633  */
634 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)635 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
636 	          const char *encoding ATTRIBUTE_UNUSED) {
637     xmlDtdPtr cur = doc->intSubset;
638 
639     if (cur == NULL) {
640 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
641 	return;
642     }
643     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
644     xmlOutputBufferWriteString(buf, (const char *)cur->name);
645     if (cur->ExternalID != NULL) {
646 	xmlOutputBufferWriteString(buf, " PUBLIC ");
647 	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
648 	if (cur->SystemID != NULL) {
649 	    xmlOutputBufferWriteString(buf, " ");
650 	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
651 	}
652     }  else if (cur->SystemID != NULL) {
653 	xmlOutputBufferWriteString(buf, " SYSTEM ");
654 	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
655     }
656     xmlOutputBufferWriteString(buf, ">\n");
657 }
658 
659 /**
660  * htmlAttrDumpOutput:
661  * @buf:  the HTML buffer output
662  * @doc:  the document
663  * @cur:  the attribute pointer
664  * @encoding:  the encoding string
665  *
666  * Dump an HTML attribute
667  */
668 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)669 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
670 	           const char *encoding ATTRIBUTE_UNUSED) {
671     xmlChar *value;
672 
673     /*
674      * TODO: The html output method should not escape a & character
675      *       occurring in an attribute value immediately followed by
676      *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
677      */
678 
679     if (cur == NULL) {
680 	return;
681     }
682     xmlOutputBufferWriteString(buf, " ");
683     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
684         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
685 	xmlOutputBufferWriteString(buf, ":");
686     }
687     xmlOutputBufferWriteString(buf, (const char *)cur->name);
688     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
689 	value = xmlNodeListGetString(doc, cur->children, 0);
690 	if (value) {
691 	    xmlOutputBufferWriteString(buf, "=");
692 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
693 		(cur->parent->ns == NULL) &&
694 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
695 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
696 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
697 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
698 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
699 		xmlChar *escaped;
700 		xmlChar *tmp = value;
701 
702 		while (IS_BLANK_CH(*tmp)) tmp++;
703 
704 		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
705 		if (escaped != NULL) {
706 		    xmlBufferWriteQuotedString(buf->buffer, escaped);
707 		    xmlFree(escaped);
708 		} else {
709 		    xmlBufferWriteQuotedString(buf->buffer, value);
710 		}
711 	    } else {
712 		xmlBufferWriteQuotedString(buf->buffer, value);
713 	    }
714 	    xmlFree(value);
715 	} else  {
716 	    xmlOutputBufferWriteString(buf, "=\"\"");
717 	}
718     }
719 }
720 
721 /**
722  * htmlAttrListDumpOutput:
723  * @buf:  the HTML buffer output
724  * @doc:  the document
725  * @cur:  the first attribute pointer
726  * @encoding:  the encoding string
727  *
728  * Dump a list of HTML attributes
729  */
730 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)731 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
732     if (cur == NULL) {
733 	return;
734     }
735     while (cur != NULL) {
736         htmlAttrDumpOutput(buf, doc, cur, encoding);
737 	cur = cur->next;
738     }
739 }
740 
741 
742 
743 /**
744  * htmlNodeListDumpOutput:
745  * @buf:  the HTML buffer output
746  * @doc:  the document
747  * @cur:  the first node
748  * @encoding:  the encoding string
749  * @format:  should formatting spaces been added
750  *
751  * Dump an HTML node list, recursive behaviour,children are printed too.
752  */
753 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)754 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
755 	               xmlNodePtr cur, const char *encoding, int format) {
756     if (cur == NULL) {
757 	return;
758     }
759     while (cur != NULL) {
760         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
761 	cur = cur->next;
762     }
763 }
764 
765 /**
766  * htmlNodeDumpFormatOutput:
767  * @buf:  the HTML buffer output
768  * @doc:  the document
769  * @cur:  the current node
770  * @encoding:  the encoding string
771  * @format:  should formatting spaces been added
772  *
773  * Dump an HTML node, recursive behaviour,children are printed too.
774  */
775 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)776 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
777 	                 xmlNodePtr cur, const char *encoding, int format) {
778     const htmlElemDesc * info;
779 
780     xmlInitParser();
781 
782     if ((cur == NULL) || (buf == NULL)) {
783 	return;
784     }
785     /*
786      * Special cases.
787      */
788     if (cur->type == XML_DTD_NODE)
789 	return;
790     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
791         (cur->type == XML_DOCUMENT_NODE)){
792 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
793 	return;
794     }
795     if (cur->type == XML_ATTRIBUTE_NODE) {
796         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
797 	return;
798     }
799     if (cur->type == HTML_TEXT_NODE) {
800 	if (cur->content != NULL) {
801 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
802 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
803 		((cur->parent == NULL) ||
804 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
805 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
806 		xmlChar *buffer;
807 
808 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
809 		if (buffer != NULL) {
810 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
811 		    xmlFree(buffer);
812 		}
813 	    } else {
814 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
815 	    }
816 	}
817 	return;
818     }
819     if (cur->type == HTML_COMMENT_NODE) {
820 	if (cur->content != NULL) {
821 	    xmlOutputBufferWriteString(buf, "<!--");
822 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
823 	    xmlOutputBufferWriteString(buf, "-->");
824 	}
825 	return;
826     }
827     if (cur->type == HTML_PI_NODE) {
828 	if (cur->name == NULL)
829 	    return;
830 	xmlOutputBufferWriteString(buf, "<?");
831 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
832 	if (cur->content != NULL) {
833 	    xmlOutputBufferWriteString(buf, " ");
834 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
835 	}
836 	xmlOutputBufferWriteString(buf, ">");
837 	return;
838     }
839     if (cur->type == HTML_ENTITY_REF_NODE) {
840         xmlOutputBufferWriteString(buf, "&");
841 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
842         xmlOutputBufferWriteString(buf, ";");
843 	return;
844     }
845     if (cur->type == HTML_PRESERVE_NODE) {
846 	if (cur->content != NULL) {
847 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
848 	}
849 	return;
850     }
851 
852     /*
853      * Get specific HTML info for that node.
854      */
855     if (cur->ns == NULL)
856 	info = htmlTagLookup(cur->name);
857     else
858 	info = NULL;
859 
860     xmlOutputBufferWriteString(buf, "<");
861     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
862         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
863 	xmlOutputBufferWriteString(buf, ":");
864     }
865     xmlOutputBufferWriteString(buf, (const char *)cur->name);
866     if (cur->nsDef)
867 	xmlNsListDumpOutput(buf, cur->nsDef);
868     if (cur->properties != NULL)
869         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
870 
871     if ((info != NULL) && (info->empty)) {
872         xmlOutputBufferWriteString(buf, ">");
873 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
874 	    if ((cur->next->type != HTML_TEXT_NODE) &&
875 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
876 		(cur->parent != NULL) &&
877 		(cur->parent->name != NULL) &&
878 		(cur->parent->name[0] != 'p')) /* p, pre, param */
879 		xmlOutputBufferWriteString(buf, "\n");
880 	}
881 	return;
882     }
883     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
884 	(cur->children == NULL)) {
885         if ((info != NULL) && (info->saveEndTag != 0) &&
886 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
887 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
888 	    xmlOutputBufferWriteString(buf, ">");
889 	} else {
890 	    xmlOutputBufferWriteString(buf, "></");
891             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
892                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
893                 xmlOutputBufferWriteString(buf, ":");
894             }
895 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
896 	    xmlOutputBufferWriteString(buf, ">");
897 	}
898 	if ((format) && (cur->next != NULL) &&
899             (info != NULL) && (!info->isinline)) {
900 	    if ((cur->next->type != HTML_TEXT_NODE) &&
901 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
902 		(cur->parent != NULL) &&
903 		(cur->parent->name != NULL) &&
904 		(cur->parent->name[0] != 'p')) /* p, pre, param */
905 		xmlOutputBufferWriteString(buf, "\n");
906 	}
907 	return;
908     }
909     xmlOutputBufferWriteString(buf, ">");
910     if ((cur->type != XML_ELEMENT_NODE) &&
911 	(cur->content != NULL)) {
912 	    /*
913 	     * Uses the OutputBuffer property to automatically convert
914 	     * invalids to charrefs
915 	     */
916 
917             xmlOutputBufferWriteString(buf, (const char *) cur->content);
918     }
919     if (cur->children != NULL) {
920         if ((format) && (info != NULL) && (!info->isinline) &&
921 	    (cur->children->type != HTML_TEXT_NODE) &&
922 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
923 	    (cur->children != cur->last) &&
924 	    (cur->name != NULL) &&
925 	    (cur->name[0] != 'p')) /* p, pre, param */
926 	    xmlOutputBufferWriteString(buf, "\n");
927 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
928         if ((format) && (info != NULL) && (!info->isinline) &&
929 	    (cur->last->type != HTML_TEXT_NODE) &&
930 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
931 	    (cur->children != cur->last) &&
932 	    (cur->name != NULL) &&
933 	    (cur->name[0] != 'p')) /* p, pre, param */
934 	    xmlOutputBufferWriteString(buf, "\n");
935     }
936     xmlOutputBufferWriteString(buf, "</");
937     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
938         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
939 	xmlOutputBufferWriteString(buf, ":");
940     }
941     xmlOutputBufferWriteString(buf, (const char *)cur->name);
942     xmlOutputBufferWriteString(buf, ">");
943     if ((format) && (info != NULL) && (!info->isinline) &&
944 	(cur->next != NULL)) {
945         if ((cur->next->type != HTML_TEXT_NODE) &&
946 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
947 	    (cur->parent != NULL) &&
948 	    (cur->parent->name != NULL) &&
949 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
950 	    xmlOutputBufferWriteString(buf, "\n");
951     }
952 }
953 
954 /**
955  * htmlNodeDumpOutput:
956  * @buf:  the HTML buffer output
957  * @doc:  the document
958  * @cur:  the current node
959  * @encoding:  the encoding string
960  *
961  * Dump an HTML node, recursive behaviour,children are printed too,
962  * and formatting returns/spaces are added.
963  */
964 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)965 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
966 	           xmlNodePtr cur, const char *encoding) {
967     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
968 }
969 
970 /**
971  * htmlDocContentDumpFormatOutput:
972  * @buf:  the HTML buffer output
973  * @cur:  the document
974  * @encoding:  the encoding string
975  * @format:  should formatting spaces been added
976  *
977  * Dump an HTML document.
978  */
979 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)980 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
981 	                       const char *encoding, int format) {
982     int type;
983 
984     xmlInitParser();
985 
986     if ((buf == NULL) || (cur == NULL))
987         return;
988 
989     /*
990      * force to output the stuff as HTML, especially for entities
991      */
992     type = cur->type;
993     cur->type = XML_HTML_DOCUMENT_NODE;
994     if (cur->intSubset != NULL) {
995         htmlDtdDumpOutput(buf, cur, NULL);
996     }
997     if (cur->children != NULL) {
998         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
999     }
1000     xmlOutputBufferWriteString(buf, "\n");
1001     cur->type = (xmlElementType) type;
1002 }
1003 
1004 /**
1005  * htmlDocContentDumpOutput:
1006  * @buf:  the HTML buffer output
1007  * @cur:  the document
1008  * @encoding:  the encoding string
1009  *
1010  * Dump an HTML document. Formating return/spaces are added.
1011  */
1012 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1013 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1014 	                 const char *encoding) {
1015     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1016 }
1017 
1018 /************************************************************************
1019  *									*
1020  *		Saving functions front-ends				*
1021  *									*
1022  ************************************************************************/
1023 
1024 /**
1025  * htmlDocDump:
1026  * @f:  the FILE*
1027  * @cur:  the document
1028  *
1029  * Dump an HTML document to an open FILE.
1030  *
1031  * returns: the number of byte written or -1 in case of failure.
1032  */
1033 int
htmlDocDump(FILE * f,xmlDocPtr cur)1034 htmlDocDump(FILE *f, xmlDocPtr cur) {
1035     xmlOutputBufferPtr buf;
1036     xmlCharEncodingHandlerPtr handler = NULL;
1037     const char *encoding;
1038     int ret;
1039 
1040     xmlInitParser();
1041 
1042     if ((cur == NULL) || (f == NULL)) {
1043 	return(-1);
1044     }
1045 
1046     encoding = (const char *) htmlGetMetaEncoding(cur);
1047 
1048     if (encoding != NULL) {
1049 	xmlCharEncoding enc;
1050 
1051 	enc = xmlParseCharEncoding(encoding);
1052 	if (enc != cur->charset) {
1053 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1054 		/*
1055 		 * Not supported yet
1056 		 */
1057 		return(-1);
1058 	    }
1059 
1060 	    handler = xmlFindCharEncodingHandler(encoding);
1061 	    if (handler == NULL)
1062 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1063 	} else {
1064 	    handler = xmlFindCharEncodingHandler(encoding);
1065 	}
1066     }
1067 
1068     /*
1069      * Fallback to HTML or ASCII when the encoding is unspecified
1070      */
1071     if (handler == NULL)
1072 	handler = xmlFindCharEncodingHandler("HTML");
1073     if (handler == NULL)
1074 	handler = xmlFindCharEncodingHandler("ascii");
1075 
1076     buf = xmlOutputBufferCreateFile(f, handler);
1077     if (buf == NULL) return(-1);
1078     htmlDocContentDumpOutput(buf, cur, NULL);
1079 
1080     ret = xmlOutputBufferClose(buf);
1081     return(ret);
1082 }
1083 
1084 /**
1085  * htmlSaveFile:
1086  * @filename:  the filename (or URL)
1087  * @cur:  the document
1088  *
1089  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1090  * used.
1091  * returns: the number of byte written or -1 in case of failure.
1092  */
1093 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1094 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1095     xmlOutputBufferPtr buf;
1096     xmlCharEncodingHandlerPtr handler = NULL;
1097     const char *encoding;
1098     int ret;
1099 
1100     if ((cur == NULL) || (filename == NULL))
1101         return(-1);
1102 
1103     xmlInitParser();
1104 
1105     encoding = (const char *) htmlGetMetaEncoding(cur);
1106 
1107     if (encoding != NULL) {
1108 	xmlCharEncoding enc;
1109 
1110 	enc = xmlParseCharEncoding(encoding);
1111 	if (enc != cur->charset) {
1112 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1113 		/*
1114 		 * Not supported yet
1115 		 */
1116 		return(-1);
1117 	    }
1118 
1119 	    handler = xmlFindCharEncodingHandler(encoding);
1120 	    if (handler == NULL)
1121 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1122 	}
1123     }
1124 
1125     /*
1126      * Fallback to HTML or ASCII when the encoding is unspecified
1127      */
1128     if (handler == NULL)
1129 	handler = xmlFindCharEncodingHandler("HTML");
1130     if (handler == NULL)
1131 	handler = xmlFindCharEncodingHandler("ascii");
1132 
1133     /*
1134      * save the content to a temp buffer.
1135      */
1136     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1137     if (buf == NULL) return(0);
1138 
1139     htmlDocContentDumpOutput(buf, cur, NULL);
1140 
1141     ret = xmlOutputBufferClose(buf);
1142     return(ret);
1143 }
1144 
1145 /**
1146  * htmlSaveFileFormat:
1147  * @filename:  the filename
1148  * @cur:  the document
1149  * @format:  should formatting spaces been added
1150  * @encoding: the document encoding
1151  *
1152  * Dump an HTML document to a file using a given encoding.
1153  *
1154  * returns: the number of byte written or -1 in case of failure.
1155  */
1156 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1157 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1158 	           const char *encoding, int format) {
1159     xmlOutputBufferPtr buf;
1160     xmlCharEncodingHandlerPtr handler = NULL;
1161     int ret;
1162 
1163     if ((cur == NULL) || (filename == NULL))
1164         return(-1);
1165 
1166     xmlInitParser();
1167 
1168     if (encoding != NULL) {
1169 	xmlCharEncoding enc;
1170 
1171 	enc = xmlParseCharEncoding(encoding);
1172 	if (enc != cur->charset) {
1173 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1174 		/*
1175 		 * Not supported yet
1176 		 */
1177 		return(-1);
1178 	    }
1179 
1180 	    handler = xmlFindCharEncodingHandler(encoding);
1181 	    if (handler == NULL)
1182 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1183 	}
1184         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1185     } else {
1186 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1187     }
1188 
1189     /*
1190      * Fallback to HTML or ASCII when the encoding is unspecified
1191      */
1192     if (handler == NULL)
1193 	handler = xmlFindCharEncodingHandler("HTML");
1194     if (handler == NULL)
1195 	handler = xmlFindCharEncodingHandler("ascii");
1196 
1197     /*
1198      * save the content to a temp buffer.
1199      */
1200     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1201     if (buf == NULL) return(0);
1202 
1203     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1204 
1205     ret = xmlOutputBufferClose(buf);
1206     return(ret);
1207 }
1208 
1209 /**
1210  * htmlSaveFileEnc:
1211  * @filename:  the filename
1212  * @cur:  the document
1213  * @encoding: the document encoding
1214  *
1215  * Dump an HTML document to a file using a given encoding
1216  * and formatting returns/spaces are added.
1217  *
1218  * returns: the number of byte written or -1 in case of failure.
1219  */
1220 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1221 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1222     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1223 }
1224 
1225 #endif /* LIBXML_OUTPUT_ENABLED */
1226 
1227 #define bottom_HTMLtree
1228 #include "elfgcchack.h"
1229 #endif /* LIBXML_HTML_ENABLED */
1230