• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22 
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32 
33 /************************************************************************
34  *									*
35  *   		Getting/Setting encoding meta tags			*
36  *									*
37  ************************************************************************/
38 
39 /**
40  * htmlGetMetaEncoding:
41  * @doc:  the document
42  *
43  * Encoding definition lookup in the Meta tags
44  *
45  * Returns the current encoding as flagged in the HTML source
46  */
47 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)48 htmlGetMetaEncoding(htmlDocPtr doc) {
49     htmlNodePtr cur;
50     const xmlChar *content;
51     const xmlChar *encoding;
52 
53     if (doc == NULL)
54 	return(NULL);
55     cur = doc->children;
56 
57     /*
58      * Search the html
59      */
60     while (cur != NULL) {
61 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63 		break;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65 		goto found_head;
66 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67 		goto found_meta;
68 	}
69 	cur = cur->next;
70     }
71     if (cur == NULL)
72 	return(NULL);
73     cur = cur->children;
74 
75     /*
76      * Search the head
77      */
78     while (cur != NULL) {
79 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81 		break;
82 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83 		goto found_meta;
84 	}
85 	cur = cur->next;
86     }
87     if (cur == NULL)
88 	return(NULL);
89 found_head:
90     cur = cur->children;
91 
92     /*
93      * Search the meta elements
94      */
95 found_meta:
96     while (cur != NULL) {
97 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99 		xmlAttrPtr attr = cur->properties;
100 		int http;
101 		const xmlChar *value;
102 
103 		content = NULL;
104 		http = 0;
105 		while (attr != NULL) {
106 		    if ((attr->children != NULL) &&
107 		        (attr->children->type == XML_TEXT_NODE) &&
108 		        (attr->children->next == NULL)) {
109 			value = attr->children->content;
110 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112 			    http = 1;
113 			else if ((value != NULL)
114 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115 			    content = value;
116 			if ((http != 0) && (content != NULL))
117 			    goto found_content;
118 		    }
119 		    attr = attr->next;
120 		}
121 	    }
122 	}
123 	cur = cur->next;
124     }
125     return(NULL);
126 
127 found_content:
128     encoding = xmlStrstr(content, BAD_CAST"charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131     if (encoding == NULL)
132 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133     if (encoding != NULL) {
134 	encoding += 8;
135     } else {
136 	encoding = xmlStrstr(content, BAD_CAST"charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139 	if (encoding == NULL)
140 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141 	if (encoding != NULL)
142 	    encoding += 9;
143     }
144     if (encoding != NULL) {
145 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146     }
147     return(encoding);
148 }
149 
150 /**
151  * htmlSetMetaEncoding:
152  * @doc:  the document
153  * @encoding:  the encoding string
154  *
155  * Sets the current encoding in the Meta tags
156  * NOTE: this will not change the document content encoding, just
157  * the META flag associated.
158  *
159  * Returns 0 in case of success and -1 in case of error
160  */
161 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163     htmlNodePtr cur, meta = NULL, head = NULL;
164     const xmlChar *content = NULL;
165     char newcontent[100];
166 
167 
168     if (doc == NULL)
169 	return(-1);
170 
171     /* html isn't a real encoding it's just libxml2 way to get entities */
172     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
173         return(-1);
174 
175     if (encoding != NULL) {
176 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
177                 (char *)encoding);
178 	newcontent[sizeof(newcontent) - 1] = 0;
179     }
180 
181     cur = doc->children;
182 
183     /*
184      * Search the html
185      */
186     while (cur != NULL) {
187 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
188 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
189 		break;
190 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
191 		goto found_head;
192 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
193 		goto found_meta;
194 	}
195 	cur = cur->next;
196     }
197     if (cur == NULL)
198 	return(-1);
199     cur = cur->children;
200 
201     /*
202      * Search the head
203      */
204     while (cur != NULL) {
205 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
206 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
207 		break;
208 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
209                 head = cur->parent;
210 		goto found_meta;
211             }
212 	}
213 	cur = cur->next;
214     }
215     if (cur == NULL)
216 	return(-1);
217 found_head:
218     head = cur;
219     if (cur->children == NULL)
220         goto create;
221     cur = cur->children;
222 
223 found_meta:
224     /*
225      * Search and update all the remaining the meta elements carrying
226      * encoding informations
227      */
228     while (cur != NULL) {
229 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
230 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
231 		xmlAttrPtr attr = cur->properties;
232 		int http;
233 		const xmlChar *value;
234 
235 		content = NULL;
236 		http = 0;
237 		while (attr != NULL) {
238 		    if ((attr->children != NULL) &&
239 		        (attr->children->type == XML_TEXT_NODE) &&
240 		        (attr->children->next == NULL)) {
241 			value = attr->children->content;
242 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
243 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
244 			    http = 1;
245 			else
246                         {
247                            if ((value != NULL) &&
248                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
249 			       content = value;
250                         }
251 		        if ((http != 0) && (content != NULL))
252 			    break;
253 		    }
254 		    attr = attr->next;
255 		}
256 		if ((http != 0) && (content != NULL)) {
257 		    meta = cur;
258 		    break;
259 		}
260 
261 	    }
262 	}
263 	cur = cur->next;
264     }
265 create:
266     if (meta == NULL) {
267         if ((encoding != NULL) && (head != NULL)) {
268             /*
269              * Create a new Meta element with the right attributes
270              */
271 
272             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
273             if (head->children == NULL)
274                 xmlAddChild(head, meta);
275             else
276                 xmlAddPrevSibling(head->children, meta);
277             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
278             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
279         }
280     } else {
281         /* change the document only if there is a real encoding change */
282         if (xmlStrcasestr(content, encoding) == NULL) {
283             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
284         }
285     }
286 
287 
288     return(0);
289 }
290 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char *htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
304 
305 
306 /**
307  * htmlIsBooleanAttr:
308  * @name:  the name of the attribute to check
309  *
310  * Determine if a given attribute is a boolean attribute.
311  *
312  * returns: false if the attribute is not boolean, true otherwise.
313  */
314 int
htmlIsBooleanAttr(const xmlChar * name)315 htmlIsBooleanAttr(const xmlChar *name)
316 {
317     int i = 0;
318 
319     while (htmlBooleanAttrs[i] != NULL) {
320         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
321             return 1;
322         i++;
323     }
324     return 0;
325 }
326 
327 #ifdef LIBXML_OUTPUT_ENABLED
328 /*
329  * private routine exported from xmlIO.c
330  */
331 xmlOutputBufferPtr
332 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
333 /************************************************************************
334  *									*
335  * 			Output error handlers				*
336  *									*
337  ************************************************************************/
338 /**
339  * htmlSaveErrMemory:
340  * @extra:  extra informations
341  *
342  * Handle an out of memory condition
343  */
344 static void
htmlSaveErrMemory(const char * extra)345 htmlSaveErrMemory(const char *extra)
346 {
347     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
348 }
349 
350 /**
351  * htmlSaveErr:
352  * @code:  the error number
353  * @node:  the location of the error.
354  * @extra:  extra informations
355  *
356  * Handle an out of memory condition
357  */
358 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)359 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
360 {
361     const char *msg = NULL;
362 
363     switch(code) {
364         case XML_SAVE_NOT_UTF8:
365 	    msg = "string is not in UTF-8\n";
366 	    break;
367 	case XML_SAVE_CHAR_INVALID:
368 	    msg = "invalid character value\n";
369 	    break;
370 	case XML_SAVE_UNKNOWN_ENCODING:
371 	    msg = "unknown encoding %s\n";
372 	    break;
373 	case XML_SAVE_NO_DOCTYPE:
374 	    msg = "HTML has no DOCTYPE\n";
375 	    break;
376 	default:
377 	    msg = "unexpected error number\n";
378     }
379     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
380 }
381 
382 /************************************************************************
383  *									*
384  *   		Dumping HTML tree content to a simple buffer		*
385  *									*
386  ************************************************************************/
387 
388 static int
389 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
390 	           int format);
391 
392 /**
393  * htmlNodeDumpFormat:
394  * @buf:  the HTML buffer output
395  * @doc:  the document
396  * @cur:  the current node
397  * @format:  should formatting spaces been added
398  *
399  * Dump an HTML node, recursive behaviour,children are printed too.
400  *
401  * Returns the number of byte written or -1 in case of error
402  */
403 static int
htmlNodeDumpFormat(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)404 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
405 	           int format) {
406     unsigned int use;
407     int ret;
408     xmlOutputBufferPtr outbuf;
409 
410     if (cur == NULL) {
411 	return (-1);
412     }
413     if (buf == NULL) {
414 	return (-1);
415     }
416     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
417     if (outbuf == NULL) {
418         htmlSaveErrMemory("allocating HTML output buffer");
419 	return (-1);
420     }
421     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
422     outbuf->buffer = buf;
423     outbuf->encoder = NULL;
424     outbuf->writecallback = NULL;
425     outbuf->closecallback = NULL;
426     outbuf->context = NULL;
427     outbuf->written = 0;
428 
429     use = buf->use;
430     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
431     xmlFree(outbuf);
432     ret = buf->use - use;
433     return (ret);
434 }
435 
436 /**
437  * htmlNodeDump:
438  * @buf:  the HTML buffer output
439  * @doc:  the document
440  * @cur:  the current node
441  *
442  * Dump an HTML node, recursive behaviour,children are printed too,
443  * and formatting returns are added.
444  *
445  * Returns the number of byte written or -1 in case of error
446  */
447 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)448 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
449     xmlInitParser();
450 
451     return(htmlNodeDumpFormat(buf, doc, cur, 1));
452 }
453 
454 /**
455  * htmlNodeDumpFileFormat:
456  * @out:  the FILE pointer
457  * @doc:  the document
458  * @cur:  the current node
459  * @encoding: the document encoding
460  * @format:  should formatting spaces been added
461  *
462  * Dump an HTML node, recursive behaviour,children are printed too.
463  *
464  * TODO: if encoding == NULL try to save in the doc encoding
465  *
466  * returns: the number of byte written or -1 in case of failure.
467  */
468 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)469 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
470 	               xmlNodePtr cur, const char *encoding, int format) {
471     xmlOutputBufferPtr buf;
472     xmlCharEncodingHandlerPtr handler = NULL;
473     int ret;
474 
475     xmlInitParser();
476 
477     if (encoding != NULL) {
478 	xmlCharEncoding enc;
479 
480 	enc = xmlParseCharEncoding(encoding);
481 	if (enc != XML_CHAR_ENCODING_UTF8) {
482 	    handler = xmlFindCharEncodingHandler(encoding);
483 	    if (handler == NULL)
484 		return(-1);
485 	}
486     }
487 
488     /*
489      * Fallback to HTML or ASCII when the encoding is unspecified
490      */
491     if (handler == NULL)
492 	handler = xmlFindCharEncodingHandler("HTML");
493     if (handler == NULL)
494 	handler = xmlFindCharEncodingHandler("ascii");
495 
496     /*
497      * save the content to a temp buffer.
498      */
499     buf = xmlOutputBufferCreateFile(out, handler);
500     if (buf == NULL) return(0);
501 
502     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
503 
504     ret = xmlOutputBufferClose(buf);
505     return(ret);
506 }
507 
508 /**
509  * htmlNodeDumpFile:
510  * @out:  the FILE pointer
511  * @doc:  the document
512  * @cur:  the current node
513  *
514  * Dump an HTML node, recursive behaviour,children are printed too,
515  * and formatting returns are added.
516  */
517 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)518 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
519     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
520 }
521 
522 /**
523  * htmlDocDumpMemoryFormat:
524  * @cur:  the document
525  * @mem:  OUT: the memory pointer
526  * @size:  OUT: the memory length
527  * @format:  should formatting spaces been added
528  *
529  * Dump an HTML document in memory and return the xmlChar * and it's size.
530  * It's up to the caller to free the memory.
531  */
532 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)533 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
534     xmlOutputBufferPtr buf;
535     xmlCharEncodingHandlerPtr handler = NULL;
536     const char *encoding;
537 
538     xmlInitParser();
539 
540     if ((mem == NULL) || (size == NULL))
541         return;
542     if (cur == NULL) {
543 	*mem = NULL;
544 	*size = 0;
545 	return;
546     }
547 
548     encoding = (const char *) htmlGetMetaEncoding(cur);
549 
550     if (encoding != NULL) {
551 	xmlCharEncoding enc;
552 
553 	enc = xmlParseCharEncoding(encoding);
554 	if (enc != cur->charset) {
555 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
556 		/*
557 		 * Not supported yet
558 		 */
559 		*mem = NULL;
560 		*size = 0;
561 		return;
562 	    }
563 
564 	    handler = xmlFindCharEncodingHandler(encoding);
565 	    if (handler == NULL) {
566 		*mem = NULL;
567 		*size = 0;
568 		return;
569 	    }
570 	} else {
571 	    handler = xmlFindCharEncodingHandler(encoding);
572 	}
573     }
574 
575     /*
576      * Fallback to HTML or ASCII when the encoding is unspecified
577      */
578     if (handler == NULL)
579 	handler = xmlFindCharEncodingHandler("HTML");
580     if (handler == NULL)
581 	handler = xmlFindCharEncodingHandler("ascii");
582 
583     buf = xmlAllocOutputBufferInternal(handler);
584     if (buf == NULL) {
585 	*mem = NULL;
586 	*size = 0;
587 	return;
588     }
589 
590 	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
591 
592     xmlOutputBufferFlush(buf);
593     if (buf->conv != NULL) {
594 	*size = buf->conv->use;
595 	*mem = xmlStrndup(buf->conv->content, *size);
596     } else {
597 	*size = buf->buffer->use;
598 	*mem = xmlStrndup(buf->buffer->content, *size);
599     }
600     (void)xmlOutputBufferClose(buf);
601 }
602 
603 /**
604  * htmlDocDumpMemory:
605  * @cur:  the document
606  * @mem:  OUT: the memory pointer
607  * @size:  OUT: the memory length
608  *
609  * Dump an HTML document in memory and return the xmlChar * and it's size.
610  * It's up to the caller to free the memory.
611  */
612 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)613 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
614 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
615 }
616 
617 
618 /************************************************************************
619  *									*
620  *   		Dumping HTML tree content to an I/O output buffer	*
621  *									*
622  ************************************************************************/
623 
624 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
625 
626 /**
627  * htmlDtdDumpOutput:
628  * @buf:  the HTML buffer output
629  * @doc:  the document
630  * @encoding:  the encoding string
631  *
632  * TODO: check whether encoding is needed
633  *
634  * Dump the HTML document DTD, if any.
635  */
636 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)637 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
638 	          const char *encoding ATTRIBUTE_UNUSED) {
639     xmlDtdPtr cur = doc->intSubset;
640 
641     if (cur == NULL) {
642 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
643 	return;
644     }
645     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
646     xmlOutputBufferWriteString(buf, (const char *)cur->name);
647     if (cur->ExternalID != NULL) {
648 	xmlOutputBufferWriteString(buf, " PUBLIC ");
649 	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
650 	if (cur->SystemID != NULL) {
651 	    xmlOutputBufferWriteString(buf, " ");
652 	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
653 	}
654     }  else if (cur->SystemID != NULL) {
655 	xmlOutputBufferWriteString(buf, " SYSTEM ");
656 	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
657     }
658     xmlOutputBufferWriteString(buf, ">\n");
659 }
660 
661 /**
662  * htmlAttrDumpOutput:
663  * @buf:  the HTML buffer output
664  * @doc:  the document
665  * @cur:  the attribute pointer
666  * @encoding:  the encoding string
667  *
668  * Dump an HTML attribute
669  */
670 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)671 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
672 	           const char *encoding ATTRIBUTE_UNUSED) {
673     xmlChar *value;
674 
675     /*
676      * TODO: The html output method should not escape a & character
677      *       occurring in an attribute value immediately followed by
678      *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
679      */
680 
681     if (cur == NULL) {
682 	return;
683     }
684     xmlOutputBufferWriteString(buf, " ");
685     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
686         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
687 	xmlOutputBufferWriteString(buf, ":");
688     }
689     xmlOutputBufferWriteString(buf, (const char *)cur->name);
690     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
691 	value = xmlNodeListGetString(doc, cur->children, 0);
692 	if (value) {
693 	    xmlOutputBufferWriteString(buf, "=");
694 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
695 		(cur->parent->ns == NULL) &&
696 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
697 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
698 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
699 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
700 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
701 		xmlChar *escaped;
702 		xmlChar *tmp = value;
703 
704 		while (IS_BLANK_CH(*tmp)) tmp++;
705 
706 		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
707 		if (escaped != NULL) {
708 		    xmlBufferWriteQuotedString(buf->buffer, escaped);
709 		    xmlFree(escaped);
710 		} else {
711 		    xmlBufferWriteQuotedString(buf->buffer, value);
712 		}
713 	    } else {
714 		xmlBufferWriteQuotedString(buf->buffer, value);
715 	    }
716 	    xmlFree(value);
717 	} else  {
718 	    xmlOutputBufferWriteString(buf, "=\"\"");
719 	}
720     }
721 }
722 
723 /**
724  * htmlAttrListDumpOutput:
725  * @buf:  the HTML buffer output
726  * @doc:  the document
727  * @cur:  the first attribute pointer
728  * @encoding:  the encoding string
729  *
730  * Dump a list of HTML attributes
731  */
732 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)733 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
734     if (cur == NULL) {
735 	return;
736     }
737     while (cur != NULL) {
738         htmlAttrDumpOutput(buf, doc, cur, encoding);
739 	cur = cur->next;
740     }
741 }
742 
743 
744 
745 /**
746  * htmlNodeListDumpOutput:
747  * @buf:  the HTML buffer output
748  * @doc:  the document
749  * @cur:  the first node
750  * @encoding:  the encoding string
751  * @format:  should formatting spaces been added
752  *
753  * Dump an HTML node list, recursive behaviour,children are printed too.
754  */
755 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)756 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
757 	               xmlNodePtr cur, const char *encoding, int format) {
758     if (cur == NULL) {
759 	return;
760     }
761     while (cur != NULL) {
762         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
763 	cur = cur->next;
764     }
765 }
766 
767 /**
768  * htmlNodeDumpFormatOutput:
769  * @buf:  the HTML buffer output
770  * @doc:  the document
771  * @cur:  the current node
772  * @encoding:  the encoding string
773  * @format:  should formatting spaces been added
774  *
775  * Dump an HTML node, recursive behaviour,children are printed too.
776  */
777 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)778 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
779 	                 xmlNodePtr cur, const char *encoding, int format) {
780     const htmlElemDesc * info;
781 
782     xmlInitParser();
783 
784     if ((cur == NULL) || (buf == NULL)) {
785 	return;
786     }
787     /*
788      * Special cases.
789      */
790     if (cur->type == XML_DTD_NODE)
791 	return;
792     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
793         (cur->type == XML_DOCUMENT_NODE)){
794 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
795 	return;
796     }
797     if (cur->type == XML_ATTRIBUTE_NODE) {
798         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
799 	return;
800     }
801     if (cur->type == HTML_TEXT_NODE) {
802 	if (cur->content != NULL) {
803 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
804 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
805 		((cur->parent == NULL) ||
806 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
807 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
808 		xmlChar *buffer;
809 
810 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
811 		if (buffer != NULL) {
812 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
813 		    xmlFree(buffer);
814 		}
815 	    } else {
816 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
817 	    }
818 	}
819 	return;
820     }
821     if (cur->type == HTML_COMMENT_NODE) {
822 	if (cur->content != NULL) {
823 	    xmlOutputBufferWriteString(buf, "<!--");
824 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
825 	    xmlOutputBufferWriteString(buf, "-->");
826 	}
827 	return;
828     }
829     if (cur->type == HTML_PI_NODE) {
830 	if (cur->name == NULL)
831 	    return;
832 	xmlOutputBufferWriteString(buf, "<?");
833 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
834 	if (cur->content != NULL) {
835 	    xmlOutputBufferWriteString(buf, " ");
836 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
837 	}
838 	xmlOutputBufferWriteString(buf, ">");
839 	return;
840     }
841     if (cur->type == HTML_ENTITY_REF_NODE) {
842         xmlOutputBufferWriteString(buf, "&");
843 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
844         xmlOutputBufferWriteString(buf, ";");
845 	return;
846     }
847     if (cur->type == HTML_PRESERVE_NODE) {
848 	if (cur->content != NULL) {
849 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
850 	}
851 	return;
852     }
853 
854     /*
855      * Get specific HTML info for that node.
856      */
857     if (cur->ns == NULL)
858 	info = htmlTagLookup(cur->name);
859     else
860 	info = NULL;
861 
862     xmlOutputBufferWriteString(buf, "<");
863     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
864         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
865 	xmlOutputBufferWriteString(buf, ":");
866     }
867     xmlOutputBufferWriteString(buf, (const char *)cur->name);
868     if (cur->nsDef)
869 	xmlNsListDumpOutput(buf, cur->nsDef);
870     if (cur->properties != NULL)
871         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
872 
873     if ((info != NULL) && (info->empty)) {
874         xmlOutputBufferWriteString(buf, ">");
875 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
876 	    if ((cur->next->type != HTML_TEXT_NODE) &&
877 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
878 		(cur->parent != NULL) &&
879 		(cur->parent->name != NULL) &&
880 		(cur->parent->name[0] != 'p')) /* p, pre, param */
881 		xmlOutputBufferWriteString(buf, "\n");
882 	}
883 	return;
884     }
885     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
886 	(cur->children == NULL)) {
887         if ((info != NULL) && (info->saveEndTag != 0) &&
888 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
889 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
890 	    xmlOutputBufferWriteString(buf, ">");
891 	} else {
892 	    xmlOutputBufferWriteString(buf, "></");
893             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
894                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
895                 xmlOutputBufferWriteString(buf, ":");
896             }
897 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
898 	    xmlOutputBufferWriteString(buf, ">");
899 	}
900 	if ((format) && (cur->next != NULL) &&
901             (info != NULL) && (!info->isinline)) {
902 	    if ((cur->next->type != HTML_TEXT_NODE) &&
903 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
904 		(cur->parent != NULL) &&
905 		(cur->parent->name != NULL) &&
906 		(cur->parent->name[0] != 'p')) /* p, pre, param */
907 		xmlOutputBufferWriteString(buf, "\n");
908 	}
909 	return;
910     }
911     xmlOutputBufferWriteString(buf, ">");
912     if ((cur->type != XML_ELEMENT_NODE) &&
913 	(cur->content != NULL)) {
914 	    /*
915 	     * Uses the OutputBuffer property to automatically convert
916 	     * invalids to charrefs
917 	     */
918 
919             xmlOutputBufferWriteString(buf, (const char *) cur->content);
920     }
921     if (cur->children != NULL) {
922         if ((format) && (info != NULL) && (!info->isinline) &&
923 	    (cur->children->type != HTML_TEXT_NODE) &&
924 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
925 	    (cur->children != cur->last) &&
926 	    (cur->name != NULL) &&
927 	    (cur->name[0] != 'p')) /* p, pre, param */
928 	    xmlOutputBufferWriteString(buf, "\n");
929 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
930         if ((format) && (info != NULL) && (!info->isinline) &&
931 	    (cur->last->type != HTML_TEXT_NODE) &&
932 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
933 	    (cur->children != cur->last) &&
934 	    (cur->name != NULL) &&
935 	    (cur->name[0] != 'p')) /* p, pre, param */
936 	    xmlOutputBufferWriteString(buf, "\n");
937     }
938     xmlOutputBufferWriteString(buf, "</");
939     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941 	xmlOutputBufferWriteString(buf, ":");
942     }
943     xmlOutputBufferWriteString(buf, (const char *)cur->name);
944     xmlOutputBufferWriteString(buf, ">");
945     if ((format) && (info != NULL) && (!info->isinline) &&
946 	(cur->next != NULL)) {
947         if ((cur->next->type != HTML_TEXT_NODE) &&
948 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
949 	    (cur->parent != NULL) &&
950 	    (cur->parent->name != NULL) &&
951 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
952 	    xmlOutputBufferWriteString(buf, "\n");
953     }
954 }
955 
956 /**
957  * htmlNodeDumpOutput:
958  * @buf:  the HTML buffer output
959  * @doc:  the document
960  * @cur:  the current node
961  * @encoding:  the encoding string
962  *
963  * Dump an HTML node, recursive behaviour,children are printed too,
964  * and formatting returns/spaces are added.
965  */
966 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)967 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
968 	           xmlNodePtr cur, const char *encoding) {
969     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
970 }
971 
972 /**
973  * htmlDocContentDumpFormatOutput:
974  * @buf:  the HTML buffer output
975  * @cur:  the document
976  * @encoding:  the encoding string
977  * @format:  should formatting spaces been added
978  *
979  * Dump an HTML document.
980  */
981 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)982 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
983 	                       const char *encoding, int format) {
984     int type;
985 
986     xmlInitParser();
987 
988     if ((buf == NULL) || (cur == NULL))
989         return;
990 
991     /*
992      * force to output the stuff as HTML, especially for entities
993      */
994     type = cur->type;
995     cur->type = XML_HTML_DOCUMENT_NODE;
996     if (cur->intSubset != NULL) {
997         htmlDtdDumpOutput(buf, cur, NULL);
998     }
999     if (cur->children != NULL) {
1000         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1001     }
1002     xmlOutputBufferWriteString(buf, "\n");
1003     cur->type = (xmlElementType) type;
1004 }
1005 
1006 /**
1007  * htmlDocContentDumpOutput:
1008  * @buf:  the HTML buffer output
1009  * @cur:  the document
1010  * @encoding:  the encoding string
1011  *
1012  * Dump an HTML document. Formating return/spaces are added.
1013  */
1014 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1015 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1016 	                 const char *encoding) {
1017     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1018 }
1019 
1020 /************************************************************************
1021  *									*
1022  *		Saving functions front-ends				*
1023  *									*
1024  ************************************************************************/
1025 
1026 /**
1027  * htmlDocDump:
1028  * @f:  the FILE*
1029  * @cur:  the document
1030  *
1031  * Dump an HTML document to an open FILE.
1032  *
1033  * returns: the number of byte written or -1 in case of failure.
1034  */
1035 int
htmlDocDump(FILE * f,xmlDocPtr cur)1036 htmlDocDump(FILE *f, xmlDocPtr cur) {
1037     xmlOutputBufferPtr buf;
1038     xmlCharEncodingHandlerPtr handler = NULL;
1039     const char *encoding;
1040     int ret;
1041 
1042     xmlInitParser();
1043 
1044     if ((cur == NULL) || (f == NULL)) {
1045 	return(-1);
1046     }
1047 
1048     encoding = (const char *) htmlGetMetaEncoding(cur);
1049 
1050     if (encoding != NULL) {
1051 	xmlCharEncoding enc;
1052 
1053 	enc = xmlParseCharEncoding(encoding);
1054 	if (enc != cur->charset) {
1055 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1056 		/*
1057 		 * Not supported yet
1058 		 */
1059 		return(-1);
1060 	    }
1061 
1062 	    handler = xmlFindCharEncodingHandler(encoding);
1063 	    if (handler == NULL)
1064 		return(-1);
1065 	} else {
1066 	    handler = xmlFindCharEncodingHandler(encoding);
1067 	}
1068     }
1069 
1070     /*
1071      * Fallback to HTML or ASCII when the encoding is unspecified
1072      */
1073     if (handler == NULL)
1074 	handler = xmlFindCharEncodingHandler("HTML");
1075     if (handler == NULL)
1076 	handler = xmlFindCharEncodingHandler("ascii");
1077 
1078     buf = xmlOutputBufferCreateFile(f, handler);
1079     if (buf == NULL) return(-1);
1080     htmlDocContentDumpOutput(buf, cur, NULL);
1081 
1082     ret = xmlOutputBufferClose(buf);
1083     return(ret);
1084 }
1085 
1086 /**
1087  * htmlSaveFile:
1088  * @filename:  the filename (or URL)
1089  * @cur:  the document
1090  *
1091  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1092  * used.
1093  * returns: the number of byte written or -1 in case of failure.
1094  */
1095 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1096 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1097     xmlOutputBufferPtr buf;
1098     xmlCharEncodingHandlerPtr handler = NULL;
1099     const char *encoding;
1100     int ret;
1101 
1102     if ((cur == NULL) || (filename == NULL))
1103         return(-1);
1104 
1105     xmlInitParser();
1106 
1107     encoding = (const char *) htmlGetMetaEncoding(cur);
1108 
1109     if (encoding != NULL) {
1110 	xmlCharEncoding enc;
1111 
1112 	enc = xmlParseCharEncoding(encoding);
1113 	if (enc != cur->charset) {
1114 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1115 		/*
1116 		 * Not supported yet
1117 		 */
1118 		return(-1);
1119 	    }
1120 
1121 	    handler = xmlFindCharEncodingHandler(encoding);
1122 	    if (handler == NULL)
1123 		return(-1);
1124 	}
1125     }
1126 
1127     /*
1128      * Fallback to HTML or ASCII when the encoding is unspecified
1129      */
1130     if (handler == NULL)
1131 	handler = xmlFindCharEncodingHandler("HTML");
1132     if (handler == NULL)
1133 	handler = xmlFindCharEncodingHandler("ascii");
1134 
1135     /*
1136      * save the content to a temp buffer.
1137      */
1138     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1139     if (buf == NULL) return(0);
1140 
1141     htmlDocContentDumpOutput(buf, cur, NULL);
1142 
1143     ret = xmlOutputBufferClose(buf);
1144     return(ret);
1145 }
1146 
1147 /**
1148  * htmlSaveFileFormat:
1149  * @filename:  the filename
1150  * @cur:  the document
1151  * @format:  should formatting spaces been added
1152  * @encoding: the document encoding
1153  *
1154  * Dump an HTML document to a file using a given encoding.
1155  *
1156  * returns: the number of byte written or -1 in case of failure.
1157  */
1158 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1159 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1160 	           const char *encoding, int format) {
1161     xmlOutputBufferPtr buf;
1162     xmlCharEncodingHandlerPtr handler = NULL;
1163     int ret;
1164 
1165     if ((cur == NULL) || (filename == NULL))
1166         return(-1);
1167 
1168     xmlInitParser();
1169 
1170     if (encoding != NULL) {
1171 	xmlCharEncoding enc;
1172 
1173 	enc = xmlParseCharEncoding(encoding);
1174 	if (enc != cur->charset) {
1175 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1176 		/*
1177 		 * Not supported yet
1178 		 */
1179 		return(-1);
1180 	    }
1181 
1182 	    handler = xmlFindCharEncodingHandler(encoding);
1183 	    if (handler == NULL)
1184 		return(-1);
1185 	}
1186         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1187     } else {
1188 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1189     }
1190 
1191     /*
1192      * Fallback to HTML or ASCII when the encoding is unspecified
1193      */
1194     if (handler == NULL)
1195 	handler = xmlFindCharEncodingHandler("HTML");
1196     if (handler == NULL)
1197 	handler = xmlFindCharEncodingHandler("ascii");
1198 
1199     /*
1200      * save the content to a temp buffer.
1201      */
1202     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1203     if (buf == NULL) return(0);
1204 
1205     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1206 
1207     ret = xmlOutputBufferClose(buf);
1208     return(ret);
1209 }
1210 
1211 /**
1212  * htmlSaveFileEnc:
1213  * @filename:  the filename
1214  * @cur:  the document
1215  * @encoding: the document encoding
1216  *
1217  * Dump an HTML document to a file using a given encoding
1218  * and formatting returns/spaces are added.
1219  *
1220  * returns: the number of byte written or -1 in case of failure.
1221  */
1222 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1223 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1224     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1225 }
1226 
1227 #endif /* LIBXML_OUTPUT_ENABLED */
1228 
1229 #define bottom_HTMLtree
1230 #include "elfgcchack.h"
1231 #endif /* LIBXML_HTML_ENABLED */
1232