/*
 * HTMLtree.c : implementation of access functions for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30
31 /************************************************************************
32 * *
33 * Getting/Setting encoding meta tags *
34 * *
35 ************************************************************************/
36
37 /**
38 * htmlGetMetaEncoding:
39 * @doc: the document
40 *
41 * Encoding definition lookup in the Meta tags
42 *
43 * Returns the current encoding as flagged in the HTML source
44 */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47 htmlNodePtr cur;
48 const xmlChar *content;
49 const xmlChar *encoding;
50
51 if (doc == NULL)
52 return(NULL);
53 cur = doc->children;
54
55 /*
56 * Search the html
57 */
58 while (cur != NULL) {
59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 break;
62 if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 goto found_head;
64 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 goto found_meta;
66 }
67 cur = cur->next;
68 }
69 if (cur == NULL)
70 return(NULL);
71 cur = cur->children;
72
73 /*
74 * Search the head
75 */
76 while (cur != NULL) {
77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 break;
80 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 goto found_meta;
82 }
83 cur = cur->next;
84 }
85 if (cur == NULL)
86 return(NULL);
87 found_head:
88 cur = cur->children;
89
90 /*
91 * Search the meta elements
92 */
93 found_meta:
94 while (cur != NULL) {
95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 xmlAttrPtr attr = cur->properties;
98 int http;
99 const xmlChar *value;
100
101 content = NULL;
102 http = 0;
103 while (attr != NULL) {
104 if ((attr->children != NULL) &&
105 (attr->children->type == XML_TEXT_NODE) &&
106 (attr->children->next == NULL)) {
107 value = attr->children->content;
108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 http = 1;
111 else if ((value != NULL)
112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 content = value;
114 if ((http != 0) && (content != NULL))
115 goto found_content;
116 }
117 attr = attr->next;
118 }
119 }
120 }
121 cur = cur->next;
122 }
123 return(NULL);
124
125 found_content:
126 encoding = xmlStrstr(content, BAD_CAST"charset=");
127 if (encoding == NULL)
128 encoding = xmlStrstr(content, BAD_CAST"Charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131 if (encoding != NULL) {
132 encoding += 8;
133 } else {
134 encoding = xmlStrstr(content, BAD_CAST"charset =");
135 if (encoding == NULL)
136 encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 if (encoding != NULL)
140 encoding += 9;
141 }
142 if (encoding != NULL) {
143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144 }
145 return(encoding);
146 }
147
148 /**
149 * htmlSetMetaEncoding:
150 * @doc: the document
151 * @encoding: the encoding string
152 *
153 * Sets the current encoding in the Meta tags
154 * NOTE: this will not change the document content encoding, just
155 * the META flag associated.
156 *
157 * Returns 0 in case of success and -1 in case of error
158 */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161 htmlNodePtr cur, meta = NULL, head = NULL;
162 const xmlChar *content = NULL;
163 char newcontent[100];
164
165 newcontent[0] = 0;
166
167 if (doc == NULL)
168 return(-1);
169
170 /* html isn't a real encoding it's just libxml2 way to get entities */
171 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172 return(-1);
173
174 if (encoding != NULL) {
175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176 (char *)encoding);
177 newcontent[sizeof(newcontent) - 1] = 0;
178 }
179
180 cur = doc->children;
181
182 /*
183 * Search the html
184 */
185 while (cur != NULL) {
186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 break;
189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 goto found_head;
191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 goto found_meta;
193 }
194 cur = cur->next;
195 }
196 if (cur == NULL)
197 return(-1);
198 cur = cur->children;
199
200 /*
201 * Search the head
202 */
203 while (cur != NULL) {
204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208 head = cur->parent;
209 goto found_meta;
210 }
211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216 found_head:
217 head = cur;
218 if (cur->children == NULL)
219 goto create;
220 cur = cur->children;
221
222 found_meta:
223 /*
224 * Search and update all the remaining the meta elements carrying
225 * encoding information
226 */
227 while (cur != NULL) {
228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 xmlAttrPtr attr = cur->properties;
231 int http;
232 const xmlChar *value;
233
234 content = NULL;
235 http = 0;
236 while (attr != NULL) {
237 if ((attr->children != NULL) &&
238 (attr->children->type == XML_TEXT_NODE) &&
239 (attr->children->next == NULL)) {
240 value = attr->children->content;
241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 http = 1;
244 else
245 {
246 if ((value != NULL) &&
247 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 content = value;
249 }
250 if ((http != 0) && (content != NULL))
251 break;
252 }
253 attr = attr->next;
254 }
255 if ((http != 0) && (content != NULL)) {
256 meta = cur;
257 break;
258 }
259
260 }
261 }
262 cur = cur->next;
263 }
264 create:
265 if (meta == NULL) {
266 if ((encoding != NULL) && (head != NULL)) {
267 /*
268 * Create a new Meta element with the right attributes
269 */
270
271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272 if (head->children == NULL)
273 xmlAddChild(head, meta);
274 else
275 xmlAddPrevSibling(head->children, meta);
276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278 }
279 } else {
280 /* remove the meta tag if NULL is passed */
281 if (encoding == NULL) {
282 xmlUnlinkNode(meta);
283 xmlFreeNode(meta);
284 }
285 /* change the document only if there is a real encoding change */
286 else if (xmlStrcasestr(content, encoding) == NULL) {
287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288 }
289 }
290
291
292 return(0);
293 }
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char *const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310 /**
311 * htmlIsBooleanAttr:
312 * @name: the name of the attribute to check
313 *
314 * Determine if a given attribute is a boolean attribute.
315 *
316 * returns: false if the attribute is not boolean, true otherwise.
317 */
318 int
htmlIsBooleanAttr(const xmlChar * name)319 htmlIsBooleanAttr(const xmlChar *name)
320 {
321 int i = 0;
322
323 while (htmlBooleanAttrs[i] != NULL) {
324 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325 return 1;
326 i++;
327 }
328 return 0;
329 }
330
331 #ifdef LIBXML_OUTPUT_ENABLED
332 /************************************************************************
333 * *
334 * Output error handlers *
335 * *
336 ************************************************************************/
337
338 /**
339 * htmlSaveErr:
340 * @code: the error number
341 * @node: the location of the error.
342 * @extra: extra information
343 *
344 * Handle an out of memory condition
345 */
346 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)347 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
348 {
349 const char *msg = NULL;
350 int res;
351
352 switch(code) {
353 case XML_SAVE_NOT_UTF8:
354 msg = "string is not in UTF-8\n";
355 break;
356 case XML_SAVE_CHAR_INVALID:
357 msg = "invalid character value\n";
358 break;
359 case XML_SAVE_UNKNOWN_ENCODING:
360 msg = "unknown encoding %s\n";
361 break;
362 case XML_SAVE_NO_DOCTYPE:
363 msg = "HTML has no DOCTYPE\n";
364 break;
365 default:
366 msg = "unexpected error number\n";
367 }
368
369 res = __xmlRaiseError(NULL, NULL, NULL, NULL, node,
370 XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
371 extra, NULL, NULL, 0, 0,
372 msg, extra);
373 if (res < 0)
374 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
375 }
376
377 /************************************************************************
378 * *
379 * Dumping HTML tree content to a simple buffer *
380 * *
381 ************************************************************************/
382
383 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)384 htmlFindOutputEncoder(const char *encoding) {
385 xmlCharEncodingHandler *handler = NULL;
386
387 if (encoding != NULL) {
388 xmlCharEncoding enc;
389
390 enc = xmlParseCharEncoding(encoding);
391 if (enc != XML_CHAR_ENCODING_UTF8) {
392 xmlOpenCharEncodingHandler(encoding, /* output */ 1, &handler);
393 if (handler == NULL)
394 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
395 }
396 } else {
397 /*
398 * Fallback to HTML or ASCII when the encoding is unspecified
399 */
400 if (handler == NULL)
401 xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402 if (handler == NULL)
403 xmlOpenCharEncodingHandler("ascii", /* output */ 1, &handler);
404 }
405
406 return(handler);
407 }
408
409 /**
410 * htmlBufNodeDumpFormat:
411 * @buf: the xmlBufPtr output
412 * @doc: the document
413 * @cur: the current node
414 * @format: should formatting spaces been added
415 *
416 * Dump an HTML node, recursive behaviour,children are printed too.
417 *
418 * Returns the number of byte written or -1 in case of error
419 */
420 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)421 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
422 int format) {
423 size_t use;
424 size_t ret;
425 xmlOutputBufferPtr outbuf;
426
427 if (cur == NULL) {
428 return ((size_t) -1);
429 }
430 if (buf == NULL) {
431 return ((size_t) -1);
432 }
433 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
434 if (outbuf == NULL)
435 return ((size_t) -1);
436 memset(outbuf, 0, sizeof(xmlOutputBuffer));
437 outbuf->buffer = buf;
438 outbuf->encoder = NULL;
439 outbuf->writecallback = NULL;
440 outbuf->closecallback = NULL;
441 outbuf->context = NULL;
442 outbuf->written = 0;
443
444 use = xmlBufUse(buf);
445 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
446 if (outbuf->error)
447 ret = (size_t) -1;
448 else
449 ret = xmlBufUse(buf) - use;
450 xmlFree(outbuf);
451 return (ret);
452 }
453
454 /**
455 * htmlNodeDump:
456 * @buf: the HTML buffer output
457 * @doc: the document
458 * @cur: the current node
459 *
460 * Dump an HTML node, recursive behaviour,children are printed too,
461 * and formatting returns are added.
462 *
463 * Returns the number of byte written or -1 in case of error
464 */
465 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)466 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
467 xmlBufPtr buffer;
468 size_t ret;
469
470 if ((buf == NULL) || (cur == NULL))
471 return(-1);
472
473 xmlInitParser();
474 buffer = xmlBufFromBuffer(buf);
475 if (buffer == NULL)
476 return(-1);
477
478 xmlBufSetAllocationScheme(buffer, XML_BUFFER_ALLOC_DOUBLEIT);
479 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
480
481 xmlBufBackToBuffer(buffer);
482
483 if (ret > INT_MAX)
484 return(-1);
485 return((int) ret);
486 }
487
488 /**
489 * htmlNodeDumpFileFormat:
490 * @out: the FILE pointer
491 * @doc: the document
492 * @cur: the current node
493 * @encoding: the document encoding
494 * @format: should formatting spaces been added
495 *
496 * Dump an HTML node, recursive behaviour,children are printed too.
497 *
498 * TODO: if encoding == NULL try to save in the doc encoding
499 *
500 * returns: the number of byte written or -1 in case of failure.
501 */
502 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)503 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
504 xmlNodePtr cur, const char *encoding, int format) {
505 xmlOutputBufferPtr buf;
506 xmlCharEncodingHandlerPtr handler;
507 int ret;
508
509 xmlInitParser();
510
511 /*
512 * save the content to a temp buffer.
513 */
514 handler = htmlFindOutputEncoder(encoding);
515 buf = xmlOutputBufferCreateFile(out, handler);
516 if (buf == NULL) {
517 xmlCharEncCloseFunc(handler);
518 return(0);
519 }
520
521 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
522
523 ret = xmlOutputBufferClose(buf);
524 return(ret);
525 }
526
527 /**
528 * htmlNodeDumpFile:
529 * @out: the FILE pointer
530 * @doc: the document
531 * @cur: the current node
532 *
533 * Dump an HTML node, recursive behaviour,children are printed too,
534 * and formatting returns are added.
535 */
536 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539 }
540
541 /**
542 * htmlDocDumpMemoryFormat:
543 * @cur: the document
544 * @mem: OUT: the memory pointer
545 * @size: OUT: the memory length
546 * @format: should formatting spaces been added
547 *
548 * Dump an HTML document in memory and return the xmlChar * and it's size.
549 * It's up to the caller to free the memory.
550 */
551 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
553 xmlOutputBufferPtr buf;
554 xmlCharEncodingHandlerPtr handler = NULL;
555 const char *encoding;
556
557 xmlInitParser();
558
559 if ((mem == NULL) || (size == NULL))
560 return;
561 *mem = NULL;
562 *size = 0;
563 if (cur == NULL)
564 return;
565
566 encoding = (const char *) htmlGetMetaEncoding(cur);
567 handler = htmlFindOutputEncoder(encoding);
568 buf = xmlAllocOutputBufferInternal(handler);
569 if (buf == NULL) {
570 xmlCharEncCloseFunc(handler);
571 return;
572 }
573
574 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
575
576 xmlOutputBufferFlush(buf);
577
578 if (!buf->error) {
579 if (buf->conv != NULL) {
580 *size = xmlBufUse(buf->conv);
581 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
582 } else {
583 *size = xmlBufUse(buf->buffer);
584 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
585 }
586 }
587
588 xmlOutputBufferClose(buf);
589 }
590
591 /**
592 * htmlDocDumpMemory:
593 * @cur: the document
594 * @mem: OUT: the memory pointer
595 * @size: OUT: the memory length
596 *
597 * Dump an HTML document in memory and return the xmlChar * and it's size.
598 * It's up to the caller to free the memory.
599 */
600 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)601 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
602 htmlDocDumpMemoryFormat(cur, mem, size, 1);
603 }
604
605
606 /************************************************************************
607 * *
608 * Dumping HTML tree content to an I/O output buffer *
609 * *
610 ************************************************************************/
611
612 /**
613 * htmlDtdDumpOutput:
614 * @buf: the HTML buffer output
615 * @doc: the document
616 * @encoding: the encoding string
617 *
618 * TODO: check whether encoding is needed
619 *
620 * Dump the HTML document DTD, if any.
621 */
622 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)623 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
624 const char *encoding ATTRIBUTE_UNUSED) {
625 xmlDtdPtr cur = doc->intSubset;
626
627 if (cur == NULL) {
628 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
629 return;
630 }
631 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
632 xmlOutputBufferWriteString(buf, (const char *)cur->name);
633 if (cur->ExternalID != NULL) {
634 xmlOutputBufferWriteString(buf, " PUBLIC ");
635 xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
636 if (cur->SystemID != NULL) {
637 xmlOutputBufferWriteString(buf, " ");
638 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
639 }
640 } else if (cur->SystemID != NULL &&
641 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
642 xmlOutputBufferWriteString(buf, " SYSTEM ");
643 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
644 }
645 xmlOutputBufferWriteString(buf, ">\n");
646 }
647
648 /**
649 * htmlAttrDumpOutput:
650 * @buf: the HTML buffer output
651 * @doc: the document
652 * @cur: the attribute pointer
653 *
654 * Dump an HTML attribute
655 */
656 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)657 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
658 xmlChar *value;
659
660 /*
661 * The html output method should not escape a & character
662 * occurring in an attribute value immediately followed by
663 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
664 * This is implemented in xmlEncodeEntitiesReentrant
665 */
666
667 if (cur == NULL) {
668 return;
669 }
670 xmlOutputBufferWriteString(buf, " ");
671 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
672 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
673 xmlOutputBufferWriteString(buf, ":");
674 }
675 xmlOutputBufferWriteString(buf, (const char *)cur->name);
676 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
677 value = xmlNodeListGetString(doc, cur->children, 0);
678 if (value) {
679 xmlOutputBufferWriteString(buf, "=");
680 if ((cur->ns == NULL) && (cur->parent != NULL) &&
681 (cur->parent->ns == NULL) &&
682 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
683 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
684 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
685 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
686 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
687 xmlChar *escaped;
688 xmlChar *tmp = value;
689
690 while (IS_BLANK_CH(*tmp)) tmp++;
691
692 /*
693 * Angle brackets are technically illegal in URIs, but they're
694 * used in server side includes, for example. Curly brackets
695 * are illegal as well and often used in templates.
696 * Don't escape non-whitespace, printable ASCII chars for
697 * improved interoperability. Only escape space, control
698 * and non-ASCII chars.
699 */
700 escaped = xmlURIEscapeStr(tmp,
701 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
702 if (escaped != NULL) {
703 xmlOutputBufferWriteQuotedString(buf, escaped);
704 xmlFree(escaped);
705 } else {
706 buf->error = XML_ERR_NO_MEMORY;
707 }
708 } else {
709 xmlOutputBufferWriteQuotedString(buf, value);
710 }
711 xmlFree(value);
712 } else {
713 buf->error = XML_ERR_NO_MEMORY;
714 }
715 }
716 }
717
718 /**
719 * htmlNodeDumpFormatOutput:
720 * @buf: the HTML buffer output
721 * @doc: the document
722 * @cur: the current node
723 * @encoding: the encoding string (unused)
724 * @format: should formatting spaces been added
725 *
726 * Dump an HTML node, recursive behaviour,children are printed too.
727 */
728 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)729 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
730 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
731 int format) {
732 xmlNodePtr root, parent;
733 xmlAttrPtr attr;
734 const htmlElemDesc * info;
735
736 xmlInitParser();
737
738 if ((cur == NULL) || (buf == NULL)) {
739 return;
740 }
741
742 root = cur;
743 parent = cur->parent;
744 while (1) {
745 switch (cur->type) {
746 case XML_HTML_DOCUMENT_NODE:
747 case XML_DOCUMENT_NODE:
748 if (((xmlDocPtr) cur)->intSubset != NULL) {
749 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
750 }
751 if (cur->children != NULL) {
752 /* Always validate cur->parent when descending. */
753 if (cur->parent == parent) {
754 parent = cur;
755 cur = cur->children;
756 continue;
757 }
758 } else {
759 xmlOutputBufferWriteString(buf, "\n");
760 }
761 break;
762
763 case XML_ELEMENT_NODE:
764 /*
765 * Some users like lxml are known to pass nodes with a corrupted
766 * tree structure. Fall back to a recursive call to handle this
767 * case.
768 */
769 if ((cur->parent != parent) && (cur->children != NULL)) {
770 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
771 break;
772 }
773
774 /*
775 * Get specific HTML info for that node.
776 */
777 if (cur->ns == NULL)
778 info = htmlTagLookup(cur->name);
779 else
780 info = NULL;
781
782 xmlOutputBufferWriteString(buf, "<");
783 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
784 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
785 xmlOutputBufferWriteString(buf, ":");
786 }
787 xmlOutputBufferWriteString(buf, (const char *)cur->name);
788 if (cur->nsDef)
789 xmlNsListDumpOutput(buf, cur->nsDef);
790 attr = cur->properties;
791 while (attr != NULL) {
792 htmlAttrDumpOutput(buf, doc, attr);
793 attr = attr->next;
794 }
795
796 if ((info != NULL) && (info->empty)) {
797 xmlOutputBufferWriteString(buf, ">");
798 } else if (cur->children == NULL) {
799 if ((info != NULL) && (info->saveEndTag != 0) &&
800 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
801 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
802 xmlOutputBufferWriteString(buf, ">");
803 } else {
804 xmlOutputBufferWriteString(buf, "></");
805 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
806 xmlOutputBufferWriteString(buf,
807 (const char *)cur->ns->prefix);
808 xmlOutputBufferWriteString(buf, ":");
809 }
810 xmlOutputBufferWriteString(buf, (const char *)cur->name);
811 xmlOutputBufferWriteString(buf, ">");
812 }
813 } else {
814 xmlOutputBufferWriteString(buf, ">");
815 if ((format) && (info != NULL) && (!info->isinline) &&
816 (cur->children->type != HTML_TEXT_NODE) &&
817 (cur->children->type != HTML_ENTITY_REF_NODE) &&
818 (cur->children != cur->last) &&
819 (cur->name != NULL) &&
820 (cur->name[0] != 'p')) /* p, pre, param */
821 xmlOutputBufferWriteString(buf, "\n");
822 parent = cur;
823 cur = cur->children;
824 continue;
825 }
826
827 if ((format) && (cur->next != NULL) &&
828 (info != NULL) && (!info->isinline)) {
829 if ((cur->next->type != HTML_TEXT_NODE) &&
830 (cur->next->type != HTML_ENTITY_REF_NODE) &&
831 (parent != NULL) &&
832 (parent->name != NULL) &&
833 (parent->name[0] != 'p')) /* p, pre, param */
834 xmlOutputBufferWriteString(buf, "\n");
835 }
836
837 break;
838
839 case XML_ATTRIBUTE_NODE:
840 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
841 break;
842
843 case HTML_TEXT_NODE:
844 if (cur->content == NULL)
845 break;
846 if (((cur->name == (const xmlChar *)xmlStringText) ||
847 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
848 ((parent == NULL) ||
849 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
850 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
851 xmlChar *buffer;
852
853 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
854 if (buffer == NULL) {
855 buf->error = XML_ERR_NO_MEMORY;
856 return;
857 }
858 xmlOutputBufferWriteString(buf, (const char *)buffer);
859 xmlFree(buffer);
860 } else {
861 xmlOutputBufferWriteString(buf, (const char *)cur->content);
862 }
863 break;
864
865 case HTML_COMMENT_NODE:
866 if (cur->content != NULL) {
867 xmlOutputBufferWriteString(buf, "<!--");
868 xmlOutputBufferWriteString(buf, (const char *)cur->content);
869 xmlOutputBufferWriteString(buf, "-->");
870 }
871 break;
872
873 case HTML_PI_NODE:
874 if (cur->name != NULL) {
875 xmlOutputBufferWriteString(buf, "<?");
876 xmlOutputBufferWriteString(buf, (const char *)cur->name);
877 if (cur->content != NULL) {
878 xmlOutputBufferWriteString(buf, " ");
879 xmlOutputBufferWriteString(buf,
880 (const char *)cur->content);
881 }
882 xmlOutputBufferWriteString(buf, ">");
883 }
884 break;
885
886 case HTML_ENTITY_REF_NODE:
887 xmlOutputBufferWriteString(buf, "&");
888 xmlOutputBufferWriteString(buf, (const char *)cur->name);
889 xmlOutputBufferWriteString(buf, ";");
890 break;
891
892 case HTML_PRESERVE_NODE:
893 if (cur->content != NULL) {
894 xmlOutputBufferWriteString(buf, (const char *)cur->content);
895 }
896 break;
897
898 default:
899 break;
900 }
901
902 while (1) {
903 if (cur == root)
904 return;
905 if (cur->next != NULL) {
906 cur = cur->next;
907 break;
908 }
909
910 cur = parent;
911 /* cur->parent was validated when descending. */
912 parent = cur->parent;
913
914 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
915 (cur->type == XML_DOCUMENT_NODE)) {
916 xmlOutputBufferWriteString(buf, "\n");
917 } else {
918 if ((format) && (cur->ns == NULL))
919 info = htmlTagLookup(cur->name);
920 else
921 info = NULL;
922
923 if ((format) && (info != NULL) && (!info->isinline) &&
924 (cur->last->type != HTML_TEXT_NODE) &&
925 (cur->last->type != HTML_ENTITY_REF_NODE) &&
926 (cur->children != cur->last) &&
927 (cur->name != NULL) &&
928 (cur->name[0] != 'p')) /* p, pre, param */
929 xmlOutputBufferWriteString(buf, "\n");
930
931 xmlOutputBufferWriteString(buf, "</");
932 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
933 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
934 xmlOutputBufferWriteString(buf, ":");
935 }
936 xmlOutputBufferWriteString(buf, (const char *)cur->name);
937 xmlOutputBufferWriteString(buf, ">");
938
939 if ((format) && (info != NULL) && (!info->isinline) &&
940 (cur->next != NULL)) {
941 if ((cur->next->type != HTML_TEXT_NODE) &&
942 (cur->next->type != HTML_ENTITY_REF_NODE) &&
943 (parent != NULL) &&
944 (parent->name != NULL) &&
945 (parent->name[0] != 'p')) /* p, pre, param */
946 xmlOutputBufferWriteString(buf, "\n");
947 }
948 }
949 }
950 }
951 }
952
953 /**
954 * htmlNodeDumpOutput:
955 * @buf: the HTML buffer output
956 * @doc: the document
957 * @cur: the current node
958 * @encoding: the encoding string (unused)
959 *
960 * Dump an HTML node, recursive behaviour,children are printed too,
961 * and formatting returns/spaces are added.
962 */
963 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)964 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
965 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
966 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
967 }
968
969 /**
970 * htmlDocContentDumpFormatOutput:
971 * @buf: the HTML buffer output
972 * @cur: the document
973 * @encoding: the encoding string (unused)
974 * @format: should formatting spaces been added
975 *
976 * Dump an HTML document.
977 */
978 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)979 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
980 const char *encoding ATTRIBUTE_UNUSED,
981 int format) {
982 int type = 0;
983 if (cur) {
984 type = cur->type;
985 cur->type = XML_HTML_DOCUMENT_NODE;
986 }
987 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
988 if (cur)
989 cur->type = (xmlElementType) type;
990 }
991
992 /**
993 * htmlDocContentDumpOutput:
994 * @buf: the HTML buffer output
995 * @cur: the document
996 * @encoding: the encoding string (unused)
997 *
998 * Dump an HTML document. Formatting return/spaces are added.
999 */
1000 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)1001 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1002 const char *encoding ATTRIBUTE_UNUSED) {
1003 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1004 }
1005
1006 /************************************************************************
1007 * *
1008 * Saving functions front-ends *
1009 * *
1010 ************************************************************************/
1011
1012 /**
1013 * htmlDocDump:
1014 * @f: the FILE*
1015 * @cur: the document
1016 *
1017 * Dump an HTML document to an open FILE.
1018 *
1019 * returns: the number of byte written or -1 in case of failure.
1020 */
1021 int
htmlDocDump(FILE * f,xmlDocPtr cur)1022 htmlDocDump(FILE *f, xmlDocPtr cur) {
1023 xmlOutputBufferPtr buf;
1024 xmlCharEncodingHandlerPtr handler = NULL;
1025 const char *encoding;
1026 int ret;
1027
1028 xmlInitParser();
1029
1030 if ((cur == NULL) || (f == NULL)) {
1031 return(-1);
1032 }
1033
1034 encoding = (const char *) htmlGetMetaEncoding(cur);
1035 handler = htmlFindOutputEncoder(encoding);
1036 buf = xmlOutputBufferCreateFile(f, handler);
1037 if (buf == NULL) {
1038 xmlCharEncCloseFunc(handler);
1039 return(-1);
1040 }
1041 htmlDocContentDumpOutput(buf, cur, NULL);
1042
1043 ret = xmlOutputBufferClose(buf);
1044 return(ret);
1045 }
1046
1047 /**
1048 * htmlSaveFile:
1049 * @filename: the filename (or URL)
1050 * @cur: the document
1051 *
1052 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1053 * used.
1054 * returns: the number of byte written or -1 in case of failure.
1055 */
1056 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1057 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1058 xmlOutputBufferPtr buf;
1059 xmlCharEncodingHandlerPtr handler = NULL;
1060 const char *encoding;
1061 int ret;
1062
1063 if ((cur == NULL) || (filename == NULL))
1064 return(-1);
1065
1066 xmlInitParser();
1067
1068 encoding = (const char *) htmlGetMetaEncoding(cur);
1069 handler = htmlFindOutputEncoder(encoding);
1070 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1071 if (buf == NULL) {
1072 xmlCharEncCloseFunc(handler);
1073 return(0);
1074 }
1075
1076 htmlDocContentDumpOutput(buf, cur, NULL);
1077
1078 ret = xmlOutputBufferClose(buf);
1079 return(ret);
1080 }
1081
1082 /**
1083 * htmlSaveFileFormat:
1084 * @filename: the filename
1085 * @cur: the document
1086 * @format: should formatting spaces been added
1087 * @encoding: the document encoding
1088 *
1089 * Dump an HTML document to a file using a given encoding.
1090 *
1091 * returns: the number of byte written or -1 in case of failure.
1092 */
1093 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1094 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1095 const char *encoding, int format) {
1096 xmlOutputBufferPtr buf;
1097 xmlCharEncodingHandlerPtr handler = NULL;
1098 int ret;
1099
1100 if ((cur == NULL) || (filename == NULL))
1101 return(-1);
1102
1103 xmlInitParser();
1104
1105 handler = htmlFindOutputEncoder(encoding);
1106 if (handler != NULL)
1107 htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1108 else
1109 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1110
1111 /*
1112 * save the content to a temp buffer.
1113 */
1114 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1115 if (buf == NULL) {
1116 xmlCharEncCloseFunc(handler);
1117 return(0);
1118 }
1119
1120 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1121
1122 ret = xmlOutputBufferClose(buf);
1123 return(ret);
1124 }
1125
1126 /**
1127 * htmlSaveFileEnc:
1128 * @filename: the filename
1129 * @cur: the document
1130 * @encoding: the document encoding
1131 *
1132 * Dump an HTML document to a file using a given encoding
1133 * and formatting returns/spaces are added.
1134 *
1135 * returns: the number of byte written or -1 in case of failure.
1136 */
1137 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1138 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1139 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1140 }
1141
1142 #endif /* LIBXML_OUTPUT_ENABLED */
1143
1144 #endif /* LIBXML_HTML_ENABLED */
1145