1 /*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32
33 #include "buf.h"
34
35 /************************************************************************
36 * *
37 * Getting/Setting encoding meta tags *
38 * *
39 ************************************************************************/
40
41 /**
42 * htmlGetMetaEncoding:
43 * @doc: the document
44 *
45 * Encoding definition lookup in the Meta tags
46 *
47 * Returns the current encoding as flagged in the HTML source
48 */
49 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)50 htmlGetMetaEncoding(htmlDocPtr doc) {
51 htmlNodePtr cur;
52 const xmlChar *content;
53 const xmlChar *encoding;
54
55 if (doc == NULL)
56 return(NULL);
57 cur = doc->children;
58
59 /*
60 * Search the html
61 */
62 while (cur != NULL) {
63 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
64 if (xmlStrEqual(cur->name, BAD_CAST"html"))
65 break;
66 if (xmlStrEqual(cur->name, BAD_CAST"head"))
67 goto found_head;
68 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
69 goto found_meta;
70 }
71 cur = cur->next;
72 }
73 if (cur == NULL)
74 return(NULL);
75 cur = cur->children;
76
77 /*
78 * Search the head
79 */
80 while (cur != NULL) {
81 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
82 if (xmlStrEqual(cur->name, BAD_CAST"head"))
83 break;
84 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
85 goto found_meta;
86 }
87 cur = cur->next;
88 }
89 if (cur == NULL)
90 return(NULL);
91 found_head:
92 cur = cur->children;
93
94 /*
95 * Search the meta elements
96 */
97 found_meta:
98 while (cur != NULL) {
99 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
100 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
101 xmlAttrPtr attr = cur->properties;
102 int http;
103 const xmlChar *value;
104
105 content = NULL;
106 http = 0;
107 while (attr != NULL) {
108 if ((attr->children != NULL) &&
109 (attr->children->type == XML_TEXT_NODE) &&
110 (attr->children->next == NULL)) {
111 value = attr->children->content;
112 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
113 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
114 http = 1;
115 else if ((value != NULL)
116 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
117 content = value;
118 if ((http != 0) && (content != NULL))
119 goto found_content;
120 }
121 attr = attr->next;
122 }
123 }
124 }
125 cur = cur->next;
126 }
127 return(NULL);
128
129 found_content:
130 encoding = xmlStrstr(content, BAD_CAST"charset=");
131 if (encoding == NULL)
132 encoding = xmlStrstr(content, BAD_CAST"Charset=");
133 if (encoding == NULL)
134 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
135 if (encoding != NULL) {
136 encoding += 8;
137 } else {
138 encoding = xmlStrstr(content, BAD_CAST"charset =");
139 if (encoding == NULL)
140 encoding = xmlStrstr(content, BAD_CAST"Charset =");
141 if (encoding == NULL)
142 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
143 if (encoding != NULL)
144 encoding += 9;
145 }
146 if (encoding != NULL) {
147 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
148 }
149 return(encoding);
150 }
151
152 /**
153 * htmlSetMetaEncoding:
154 * @doc: the document
155 * @encoding: the encoding string
156 *
157 * Sets the current encoding in the Meta tags
158 * NOTE: this will not change the document content encoding, just
159 * the META flag associated.
160 *
161 * Returns 0 in case of success and -1 in case of error
162 */
163 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
165 htmlNodePtr cur, meta = NULL, head = NULL;
166 const xmlChar *content = NULL;
167 char newcontent[100];
168
169 newcontent[0] = 0;
170
171 if (doc == NULL)
172 return(-1);
173
174 /* html isn't a real encoding it's just libxml2 way to get entities */
175 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
176 return(-1);
177
178 if (encoding != NULL) {
179 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180 (char *)encoding);
181 newcontent[sizeof(newcontent) - 1] = 0;
182 }
183
184 cur = doc->children;
185
186 /*
187 * Search the html
188 */
189 while (cur != NULL) {
190 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
191 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
192 break;
193 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
194 goto found_head;
195 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
196 goto found_meta;
197 }
198 cur = cur->next;
199 }
200 if (cur == NULL)
201 return(-1);
202 cur = cur->children;
203
204 /*
205 * Search the head
206 */
207 while (cur != NULL) {
208 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
209 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
210 break;
211 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
212 head = cur->parent;
213 goto found_meta;
214 }
215 }
216 cur = cur->next;
217 }
218 if (cur == NULL)
219 return(-1);
220 found_head:
221 head = cur;
222 if (cur->children == NULL)
223 goto create;
224 cur = cur->children;
225
226 found_meta:
227 /*
228 * Search and update all the remaining the meta elements carrying
229 * encoding informations
230 */
231 while (cur != NULL) {
232 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
233 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
234 xmlAttrPtr attr = cur->properties;
235 int http;
236 const xmlChar *value;
237
238 content = NULL;
239 http = 0;
240 while (attr != NULL) {
241 if ((attr->children != NULL) &&
242 (attr->children->type == XML_TEXT_NODE) &&
243 (attr->children->next == NULL)) {
244 value = attr->children->content;
245 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
246 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
247 http = 1;
248 else
249 {
250 if ((value != NULL) &&
251 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
252 content = value;
253 }
254 if ((http != 0) && (content != NULL))
255 break;
256 }
257 attr = attr->next;
258 }
259 if ((http != 0) && (content != NULL)) {
260 meta = cur;
261 break;
262 }
263
264 }
265 }
266 cur = cur->next;
267 }
268 create:
269 if (meta == NULL) {
270 if ((encoding != NULL) && (head != NULL)) {
271 /*
272 * Create a new Meta element with the right attributes
273 */
274
275 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
276 if (head->children == NULL)
277 xmlAddChild(head, meta);
278 else
279 xmlAddPrevSibling(head->children, meta);
280 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
281 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
282 }
283 } else {
284 /* remove the meta tag if NULL is passed */
285 if (encoding == NULL) {
286 xmlUnlinkNode(meta);
287 xmlFreeNode(meta);
288 }
289 /* change the document only if there is a real encoding change */
290 else if (xmlStrcasestr(content, encoding) == NULL) {
291 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
292 }
293 }
294
295
296 return(0);
297 }
298
/**
 * htmlBooleanAttrs:
 *
 * NULL terminated list of the HTML attributes which are output in
 * minimized form, i.e. <option selected="selected"> is output as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
312
313
314 /**
315 * htmlIsBooleanAttr:
316 * @name: the name of the attribute to check
317 *
318 * Determine if a given attribute is a boolean attribute.
319 *
320 * returns: false if the attribute is not boolean, true otherwise.
321 */
322 int
htmlIsBooleanAttr(const xmlChar * name)323 htmlIsBooleanAttr(const xmlChar *name)
324 {
325 int i = 0;
326
327 while (htmlBooleanAttrs[i] != NULL) {
328 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
329 return 1;
330 i++;
331 }
332 return 0;
333 }
334
335 #ifdef LIBXML_OUTPUT_ENABLED
336 /*
337 * private routine exported from xmlIO.c
338 */
339 xmlOutputBufferPtr
340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
341 /************************************************************************
342 * *
343 * Output error handlers *
344 * *
345 ************************************************************************/
346 /**
347 * htmlSaveErrMemory:
348 * @extra: extra informations
349 *
350 * Handle an out of memory condition
351 */
352 static void
htmlSaveErrMemory(const char * extra)353 htmlSaveErrMemory(const char *extra)
354 {
355 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
356 }
357
358 /**
359 * htmlSaveErr:
360 * @code: the error number
361 * @node: the location of the error.
362 * @extra: extra informations
363 *
364 * Handle an out of memory condition
365 */
366 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)367 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
368 {
369 const char *msg = NULL;
370
371 switch(code) {
372 case XML_SAVE_NOT_UTF8:
373 msg = "string is not in UTF-8\n";
374 break;
375 case XML_SAVE_CHAR_INVALID:
376 msg = "invalid character value\n";
377 break;
378 case XML_SAVE_UNKNOWN_ENCODING:
379 msg = "unknown encoding %s\n";
380 break;
381 case XML_SAVE_NO_DOCTYPE:
382 msg = "HTML has no DOCTYPE\n";
383 break;
384 default:
385 msg = "unexpected error number\n";
386 }
387 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
388 }
389
390 /************************************************************************
391 * *
392 * Dumping HTML tree content to a simple buffer *
393 * *
394 ************************************************************************/
395
396 /**
397 * htmlBufNodeDumpFormat:
398 * @buf: the xmlBufPtr output
399 * @doc: the document
400 * @cur: the current node
401 * @format: should formatting spaces been added
402 *
403 * Dump an HTML node, recursive behaviour,children are printed too.
404 *
405 * Returns the number of byte written or -1 in case of error
406 */
407 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
409 int format) {
410 size_t use;
411 int ret;
412 xmlOutputBufferPtr outbuf;
413
414 if (cur == NULL) {
415 return (-1);
416 }
417 if (buf == NULL) {
418 return (-1);
419 }
420 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
421 if (outbuf == NULL) {
422 htmlSaveErrMemory("allocating HTML output buffer");
423 return (-1);
424 }
425 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
426 outbuf->buffer = buf;
427 outbuf->encoder = NULL;
428 outbuf->writecallback = NULL;
429 outbuf->closecallback = NULL;
430 outbuf->context = NULL;
431 outbuf->written = 0;
432
433 use = xmlBufUse(buf);
434 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
435 xmlFree(outbuf);
436 ret = xmlBufUse(buf) - use;
437 return (ret);
438 }
439
440 /**
441 * htmlNodeDump:
442 * @buf: the HTML buffer output
443 * @doc: the document
444 * @cur: the current node
445 *
446 * Dump an HTML node, recursive behaviour,children are printed too,
447 * and formatting returns are added.
448 *
449 * Returns the number of byte written or -1 in case of error
450 */
451 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
453 xmlBufPtr buffer;
454 size_t ret;
455
456 if ((buf == NULL) || (cur == NULL))
457 return(-1);
458
459 xmlInitParser();
460 buffer = xmlBufFromBuffer(buf);
461 if (buffer == NULL)
462 return(-1);
463
464 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
465
466 xmlBufBackToBuffer(buffer);
467
468 if (ret > INT_MAX)
469 return(-1);
470 return((int) ret);
471 }
472
473 /**
474 * htmlNodeDumpFileFormat:
475 * @out: the FILE pointer
476 * @doc: the document
477 * @cur: the current node
478 * @encoding: the document encoding
479 * @format: should formatting spaces been added
480 *
481 * Dump an HTML node, recursive behaviour,children are printed too.
482 *
483 * TODO: if encoding == NULL try to save in the doc encoding
484 *
485 * returns: the number of byte written or -1 in case of failure.
486 */
487 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
489 xmlNodePtr cur, const char *encoding, int format) {
490 xmlOutputBufferPtr buf;
491 xmlCharEncodingHandlerPtr handler = NULL;
492 int ret;
493
494 xmlInitParser();
495
496 if (encoding != NULL) {
497 xmlCharEncoding enc;
498
499 enc = xmlParseCharEncoding(encoding);
500 if (enc != XML_CHAR_ENCODING_UTF8) {
501 handler = xmlFindCharEncodingHandler(encoding);
502 if (handler == NULL)
503 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
504 }
505 } else {
506 /*
507 * Fallback to HTML or ASCII when the encoding is unspecified
508 */
509 if (handler == NULL)
510 handler = xmlFindCharEncodingHandler("HTML");
511 if (handler == NULL)
512 handler = xmlFindCharEncodingHandler("ascii");
513 }
514
515 /*
516 * save the content to a temp buffer.
517 */
518 buf = xmlOutputBufferCreateFile(out, handler);
519 if (buf == NULL) return(0);
520
521 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
522
523 ret = xmlOutputBufferClose(buf);
524 return(ret);
525 }
526
527 /**
528 * htmlNodeDumpFile:
529 * @out: the FILE pointer
530 * @doc: the document
531 * @cur: the current node
532 *
533 * Dump an HTML node, recursive behaviour,children are printed too,
534 * and formatting returns are added.
535 */
536 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539 }
540
541 /**
542 * htmlDocDumpMemoryFormat:
543 * @cur: the document
544 * @mem: OUT: the memory pointer
545 * @size: OUT: the memory length
546 * @format: should formatting spaces been added
547 *
548 * Dump an HTML document in memory and return the xmlChar * and it's size.
549 * It's up to the caller to free the memory.
550 */
551 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
553 xmlOutputBufferPtr buf;
554 xmlCharEncodingHandlerPtr handler = NULL;
555 const char *encoding;
556
557 xmlInitParser();
558
559 if ((mem == NULL) || (size == NULL))
560 return;
561 if (cur == NULL) {
562 *mem = NULL;
563 *size = 0;
564 return;
565 }
566
567 encoding = (const char *) htmlGetMetaEncoding(cur);
568
569 if (encoding != NULL) {
570 xmlCharEncoding enc;
571
572 enc = xmlParseCharEncoding(encoding);
573 if (enc != XML_CHAR_ENCODING_UTF8) {
574 handler = xmlFindCharEncodingHandler(encoding);
575 if (handler == NULL)
576 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
577
578 }
579 } else {
580 /*
581 * Fallback to HTML or ASCII when the encoding is unspecified
582 */
583 if (handler == NULL)
584 handler = xmlFindCharEncodingHandler("HTML");
585 if (handler == NULL)
586 handler = xmlFindCharEncodingHandler("ascii");
587 }
588
589 buf = xmlAllocOutputBufferInternal(handler);
590 if (buf == NULL) {
591 *mem = NULL;
592 *size = 0;
593 return;
594 }
595
596 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
597
598 xmlOutputBufferFlush(buf);
599 if (buf->conv != NULL) {
600 *size = xmlBufUse(buf->conv);
601 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
602 } else {
603 *size = xmlBufUse(buf->buffer);
604 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
605 }
606 (void)xmlOutputBufferClose(buf);
607 }
608
609 /**
610 * htmlDocDumpMemory:
611 * @cur: the document
612 * @mem: OUT: the memory pointer
613 * @size: OUT: the memory length
614 *
615 * Dump an HTML document in memory and return the xmlChar * and it's size.
616 * It's up to the caller to free the memory.
617 */
618 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)619 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
620 htmlDocDumpMemoryFormat(cur, mem, size, 1);
621 }
622
623
624 /************************************************************************
625 * *
626 * Dumping HTML tree content to an I/O output buffer *
627 * *
628 ************************************************************************/
629
630 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
631
632 /**
633 * htmlDtdDumpOutput:
634 * @buf: the HTML buffer output
635 * @doc: the document
636 * @encoding: the encoding string
637 *
638 * TODO: check whether encoding is needed
639 *
640 * Dump the HTML document DTD, if any.
641 */
642 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)643 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
644 const char *encoding ATTRIBUTE_UNUSED) {
645 xmlDtdPtr cur = doc->intSubset;
646
647 if (cur == NULL) {
648 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
649 return;
650 }
651 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
652 xmlOutputBufferWriteString(buf, (const char *)cur->name);
653 if (cur->ExternalID != NULL) {
654 xmlOutputBufferWriteString(buf, " PUBLIC ");
655 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
656 if (cur->SystemID != NULL) {
657 xmlOutputBufferWriteString(buf, " ");
658 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659 }
660 } else if (cur->SystemID != NULL &&
661 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
662 xmlOutputBufferWriteString(buf, " SYSTEM ");
663 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
664 }
665 xmlOutputBufferWriteString(buf, ">\n");
666 }
667
668 /**
669 * htmlAttrDumpOutput:
670 * @buf: the HTML buffer output
671 * @doc: the document
672 * @cur: the attribute pointer
673 * @encoding: the encoding string
674 *
675 * Dump an HTML attribute
676 */
677 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)678 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
679 const char *encoding ATTRIBUTE_UNUSED) {
680 xmlChar *value;
681
682 /*
683 * The html output method should not escape a & character
684 * occurring in an attribute value immediately followed by
685 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
686 * This is implemented in xmlEncodeEntitiesReentrant
687 */
688
689 if (cur == NULL) {
690 return;
691 }
692 xmlOutputBufferWriteString(buf, " ");
693 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
694 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
695 xmlOutputBufferWriteString(buf, ":");
696 }
697 xmlOutputBufferWriteString(buf, (const char *)cur->name);
698 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
699 value = xmlNodeListGetString(doc, cur->children, 0);
700 if (value) {
701 xmlOutputBufferWriteString(buf, "=");
702 if ((cur->ns == NULL) && (cur->parent != NULL) &&
703 (cur->parent->ns == NULL) &&
704 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
705 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
706 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
707 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
708 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
709 xmlChar *escaped;
710 xmlChar *tmp = value;
711
712 while (IS_BLANK_CH(*tmp)) tmp++;
713
714 /*
715 * the < and > have already been escaped at the entity level
716 * And doing so here breaks server side includes
717 */
718 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
719 if (escaped != NULL) {
720 xmlBufWriteQuotedString(buf->buffer, escaped);
721 xmlFree(escaped);
722 } else {
723 xmlBufWriteQuotedString(buf->buffer, value);
724 }
725 } else {
726 xmlBufWriteQuotedString(buf->buffer, value);
727 }
728 xmlFree(value);
729 } else {
730 xmlOutputBufferWriteString(buf, "=\"\"");
731 }
732 }
733 }
734
735 /**
736 * htmlAttrListDumpOutput:
737 * @buf: the HTML buffer output
738 * @doc: the document
739 * @cur: the first attribute pointer
740 * @encoding: the encoding string
741 *
742 * Dump a list of HTML attributes
743 */
744 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)745 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
746 if (cur == NULL) {
747 return;
748 }
749 while (cur != NULL) {
750 htmlAttrDumpOutput(buf, doc, cur, encoding);
751 cur = cur->next;
752 }
753 }
754
755
756
757 /**
758 * htmlNodeListDumpOutput:
759 * @buf: the HTML buffer output
760 * @doc: the document
761 * @cur: the first node
762 * @encoding: the encoding string
763 * @format: should formatting spaces been added
764 *
765 * Dump an HTML node list, recursive behaviour,children are printed too.
766 */
767 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)768 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
769 xmlNodePtr cur, const char *encoding, int format) {
770 if (cur == NULL) {
771 return;
772 }
773 while (cur != NULL) {
774 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
775 cur = cur->next;
776 }
777 }
778
779 /**
780 * htmlNodeDumpFormatOutput:
781 * @buf: the HTML buffer output
782 * @doc: the document
783 * @cur: the current node
784 * @encoding: the encoding string
785 * @format: should formatting spaces been added
786 *
787 * Dump an HTML node, recursive behaviour,children are printed too.
788 */
789 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)790 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
791 xmlNodePtr cur, const char *encoding, int format) {
792 const htmlElemDesc * info;
793
794 xmlInitParser();
795
796 if ((cur == NULL) || (buf == NULL)) {
797 return;
798 }
799 /*
800 * Special cases.
801 */
802 if (cur->type == XML_DTD_NODE)
803 return;
804 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
805 (cur->type == XML_DOCUMENT_NODE)){
806 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
807 return;
808 }
809 if (cur->type == XML_ATTRIBUTE_NODE) {
810 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
811 return;
812 }
813 if (cur->type == HTML_TEXT_NODE) {
814 if (cur->content != NULL) {
815 if (((cur->name == (const xmlChar *)xmlStringText) ||
816 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
817 ((cur->parent == NULL) ||
818 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
819 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
820 xmlChar *buffer;
821
822 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
823 if (buffer != NULL) {
824 xmlOutputBufferWriteString(buf, (const char *)buffer);
825 xmlFree(buffer);
826 }
827 } else {
828 xmlOutputBufferWriteString(buf, (const char *)cur->content);
829 }
830 }
831 return;
832 }
833 if (cur->type == HTML_COMMENT_NODE) {
834 if (cur->content != NULL) {
835 xmlOutputBufferWriteString(buf, "<!--");
836 xmlOutputBufferWriteString(buf, (const char *)cur->content);
837 xmlOutputBufferWriteString(buf, "-->");
838 }
839 return;
840 }
841 if (cur->type == HTML_PI_NODE) {
842 if (cur->name == NULL)
843 return;
844 xmlOutputBufferWriteString(buf, "<?");
845 xmlOutputBufferWriteString(buf, (const char *)cur->name);
846 if (cur->content != NULL) {
847 xmlOutputBufferWriteString(buf, " ");
848 xmlOutputBufferWriteString(buf, (const char *)cur->content);
849 }
850 xmlOutputBufferWriteString(buf, ">");
851 return;
852 }
853 if (cur->type == HTML_ENTITY_REF_NODE) {
854 xmlOutputBufferWriteString(buf, "&");
855 xmlOutputBufferWriteString(buf, (const char *)cur->name);
856 xmlOutputBufferWriteString(buf, ";");
857 return;
858 }
859 if (cur->type == HTML_PRESERVE_NODE) {
860 if (cur->content != NULL) {
861 xmlOutputBufferWriteString(buf, (const char *)cur->content);
862 }
863 return;
864 }
865
866 /*
867 * Get specific HTML info for that node.
868 */
869 if (cur->ns == NULL)
870 info = htmlTagLookup(cur->name);
871 else
872 info = NULL;
873
874 xmlOutputBufferWriteString(buf, "<");
875 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
876 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
877 xmlOutputBufferWriteString(buf, ":");
878 }
879 xmlOutputBufferWriteString(buf, (const char *)cur->name);
880 if (cur->nsDef)
881 xmlNsListDumpOutput(buf, cur->nsDef);
882 if (cur->properties != NULL)
883 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
884
885 if ((info != NULL) && (info->empty)) {
886 xmlOutputBufferWriteString(buf, ">");
887 if ((format) && (!info->isinline) && (cur->next != NULL)) {
888 if ((cur->next->type != HTML_TEXT_NODE) &&
889 (cur->next->type != HTML_ENTITY_REF_NODE) &&
890 (cur->parent != NULL) &&
891 (cur->parent->name != NULL) &&
892 (cur->parent->name[0] != 'p')) /* p, pre, param */
893 xmlOutputBufferWriteString(buf, "\n");
894 }
895 return;
896 }
897 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
898 (cur->children == NULL)) {
899 if ((info != NULL) && (info->saveEndTag != 0) &&
900 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
901 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
902 xmlOutputBufferWriteString(buf, ">");
903 } else {
904 xmlOutputBufferWriteString(buf, "></");
905 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
906 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
907 xmlOutputBufferWriteString(buf, ":");
908 }
909 xmlOutputBufferWriteString(buf, (const char *)cur->name);
910 xmlOutputBufferWriteString(buf, ">");
911 }
912 if ((format) && (cur->next != NULL) &&
913 (info != NULL) && (!info->isinline)) {
914 if ((cur->next->type != HTML_TEXT_NODE) &&
915 (cur->next->type != HTML_ENTITY_REF_NODE) &&
916 (cur->parent != NULL) &&
917 (cur->parent->name != NULL) &&
918 (cur->parent->name[0] != 'p')) /* p, pre, param */
919 xmlOutputBufferWriteString(buf, "\n");
920 }
921 return;
922 }
923 xmlOutputBufferWriteString(buf, ">");
924 if ((cur->type != XML_ELEMENT_NODE) &&
925 (cur->content != NULL)) {
926 /*
927 * Uses the OutputBuffer property to automatically convert
928 * invalids to charrefs
929 */
930
931 xmlOutputBufferWriteString(buf, (const char *) cur->content);
932 }
933 if (cur->children != NULL) {
934 if ((format) && (info != NULL) && (!info->isinline) &&
935 (cur->children->type != HTML_TEXT_NODE) &&
936 (cur->children->type != HTML_ENTITY_REF_NODE) &&
937 (cur->children != cur->last) &&
938 (cur->name != NULL) &&
939 (cur->name[0] != 'p')) /* p, pre, param */
940 xmlOutputBufferWriteString(buf, "\n");
941 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
942 if ((format) && (info != NULL) && (!info->isinline) &&
943 (cur->last->type != HTML_TEXT_NODE) &&
944 (cur->last->type != HTML_ENTITY_REF_NODE) &&
945 (cur->children != cur->last) &&
946 (cur->name != NULL) &&
947 (cur->name[0] != 'p')) /* p, pre, param */
948 xmlOutputBufferWriteString(buf, "\n");
949 }
950 xmlOutputBufferWriteString(buf, "</");
951 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
952 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
953 xmlOutputBufferWriteString(buf, ":");
954 }
955 xmlOutputBufferWriteString(buf, (const char *)cur->name);
956 xmlOutputBufferWriteString(buf, ">");
957 if ((format) && (info != NULL) && (!info->isinline) &&
958 (cur->next != NULL)) {
959 if ((cur->next->type != HTML_TEXT_NODE) &&
960 (cur->next->type != HTML_ENTITY_REF_NODE) &&
961 (cur->parent != NULL) &&
962 (cur->parent->name != NULL) &&
963 (cur->parent->name[0] != 'p')) /* p, pre, param */
964 xmlOutputBufferWriteString(buf, "\n");
965 }
966 }
967
968 /**
969 * htmlNodeDumpOutput:
970 * @buf: the HTML buffer output
971 * @doc: the document
972 * @cur: the current node
973 * @encoding: the encoding string
974 *
975 * Dump an HTML node, recursive behaviour,children are printed too,
976 * and formatting returns/spaces are added.
977 */
978 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)979 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
980 xmlNodePtr cur, const char *encoding) {
981 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
982 }
983
984 /**
985 * htmlDocContentDumpFormatOutput:
986 * @buf: the HTML buffer output
987 * @cur: the document
988 * @encoding: the encoding string
989 * @format: should formatting spaces been added
990 *
991 * Dump an HTML document.
992 */
993 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)994 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
995 const char *encoding, int format) {
996 int type;
997
998 xmlInitParser();
999
1000 if ((buf == NULL) || (cur == NULL))
1001 return;
1002
1003 /*
1004 * force to output the stuff as HTML, especially for entities
1005 */
1006 type = cur->type;
1007 cur->type = XML_HTML_DOCUMENT_NODE;
1008 if (cur->intSubset != NULL) {
1009 htmlDtdDumpOutput(buf, cur, NULL);
1010 }
1011 if (cur->children != NULL) {
1012 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1013 }
1014 xmlOutputBufferWriteString(buf, "\n");
1015 cur->type = (xmlElementType) type;
1016 }
1017
1018 /**
1019 * htmlDocContentDumpOutput:
1020 * @buf: the HTML buffer output
1021 * @cur: the document
1022 * @encoding: the encoding string
1023 *
1024 * Dump an HTML document. Formatting return/spaces are added.
1025 */
1026 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1027 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1028 const char *encoding) {
1029 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1030 }
1031
1032 /************************************************************************
1033 * *
1034 * Saving functions front-ends *
1035 * *
1036 ************************************************************************/
1037
1038 /**
1039 * htmlDocDump:
1040 * @f: the FILE*
1041 * @cur: the document
1042 *
1043 * Dump an HTML document to an open FILE.
1044 *
1045 * returns: the number of byte written or -1 in case of failure.
1046 */
1047 int
htmlDocDump(FILE * f,xmlDocPtr cur)1048 htmlDocDump(FILE *f, xmlDocPtr cur) {
1049 xmlOutputBufferPtr buf;
1050 xmlCharEncodingHandlerPtr handler = NULL;
1051 const char *encoding;
1052 int ret;
1053
1054 xmlInitParser();
1055
1056 if ((cur == NULL) || (f == NULL)) {
1057 return(-1);
1058 }
1059
1060 encoding = (const char *) htmlGetMetaEncoding(cur);
1061
1062 if (encoding != NULL) {
1063 xmlCharEncoding enc;
1064
1065 enc = xmlParseCharEncoding(encoding);
1066 if (enc != XML_CHAR_ENCODING_UTF8) {
1067 handler = xmlFindCharEncodingHandler(encoding);
1068 if (handler == NULL)
1069 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1070 }
1071 } else {
1072 /*
1073 * Fallback to HTML or ASCII when the encoding is unspecified
1074 */
1075 if (handler == NULL)
1076 handler = xmlFindCharEncodingHandler("HTML");
1077 if (handler == NULL)
1078 handler = xmlFindCharEncodingHandler("ascii");
1079 }
1080
1081 buf = xmlOutputBufferCreateFile(f, handler);
1082 if (buf == NULL) return(-1);
1083 htmlDocContentDumpOutput(buf, cur, NULL);
1084
1085 ret = xmlOutputBufferClose(buf);
1086 return(ret);
1087 }
1088
1089 /**
1090 * htmlSaveFile:
1091 * @filename: the filename (or URL)
1092 * @cur: the document
1093 *
1094 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1095 * used.
1096 * returns: the number of byte written or -1 in case of failure.
1097 */
1098 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1099 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1100 xmlOutputBufferPtr buf;
1101 xmlCharEncodingHandlerPtr handler = NULL;
1102 const char *encoding;
1103 int ret;
1104
1105 if ((cur == NULL) || (filename == NULL))
1106 return(-1);
1107
1108 xmlInitParser();
1109
1110 encoding = (const char *) htmlGetMetaEncoding(cur);
1111
1112 if (encoding != NULL) {
1113 xmlCharEncoding enc;
1114
1115 enc = xmlParseCharEncoding(encoding);
1116 if (enc != XML_CHAR_ENCODING_UTF8) {
1117 handler = xmlFindCharEncodingHandler(encoding);
1118 if (handler == NULL)
1119 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1120 }
1121 } else {
1122 /*
1123 * Fallback to HTML or ASCII when the encoding is unspecified
1124 */
1125 if (handler == NULL)
1126 handler = xmlFindCharEncodingHandler("HTML");
1127 if (handler == NULL)
1128 handler = xmlFindCharEncodingHandler("ascii");
1129 }
1130
1131 /*
1132 * save the content to a temp buffer.
1133 */
1134 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1135 if (buf == NULL) return(0);
1136
1137 htmlDocContentDumpOutput(buf, cur, NULL);
1138
1139 ret = xmlOutputBufferClose(buf);
1140 return(ret);
1141 }
1142
1143 /**
1144 * htmlSaveFileFormat:
1145 * @filename: the filename
1146 * @cur: the document
1147 * @format: should formatting spaces been added
1148 * @encoding: the document encoding
1149 *
1150 * Dump an HTML document to a file using a given encoding.
1151 *
1152 * returns: the number of byte written or -1 in case of failure.
1153 */
1154 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1155 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1156 const char *encoding, int format) {
1157 xmlOutputBufferPtr buf;
1158 xmlCharEncodingHandlerPtr handler = NULL;
1159 int ret;
1160
1161 if ((cur == NULL) || (filename == NULL))
1162 return(-1);
1163
1164 xmlInitParser();
1165
1166 if (encoding != NULL) {
1167 xmlCharEncoding enc;
1168
1169 enc = xmlParseCharEncoding(encoding);
1170 if (enc != XML_CHAR_ENCODING_UTF8) {
1171 handler = xmlFindCharEncodingHandler(encoding);
1172 if (handler == NULL)
1173 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1174 }
1175 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1176 } else {
1177 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1178
1179 /*
1180 * Fallback to HTML or ASCII when the encoding is unspecified
1181 */
1182 if (handler == NULL)
1183 handler = xmlFindCharEncodingHandler("HTML");
1184 if (handler == NULL)
1185 handler = xmlFindCharEncodingHandler("ascii");
1186 }
1187
1188 /*
1189 * save the content to a temp buffer.
1190 */
1191 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1192 if (buf == NULL) return(0);
1193
1194 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1195
1196 ret = xmlOutputBufferClose(buf);
1197 return(ret);
1198 }
1199
1200 /**
1201 * htmlSaveFileEnc:
1202 * @filename: the filename
1203 * @cur: the document
1204 * @encoding: the document encoding
1205 *
1206 * Dump an HTML document to a file using a given encoding
1207 * and formatting returns/spaces are added.
1208 *
1209 * returns: the number of byte written or -1 in case of failure.
1210 */
1211 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1212 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1213 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1214 }
1215
1216 #endif /* LIBXML_OUTPUT_ENABLED */
1217
1218 #define bottom_HTMLtree
1219 #include "elfgcchack.h"
1220 #endif /* LIBXML_HTML_ENABLED */
1221