1 /*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/parser.h"
30 #include "private/save.h"
31
32 /************************************************************************
33 * *
34 * Getting/Setting encoding meta tags *
35 * *
36 ************************************************************************/
37
38 /**
39 * htmlGetMetaEncoding:
40 * @doc: the document
41 *
42 * Encoding definition lookup in the Meta tags
43 *
44 * Returns the current encoding as flagged in the HTML source
45 */
46 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)47 htmlGetMetaEncoding(htmlDocPtr doc) {
48 htmlNodePtr cur;
49 const xmlChar *content;
50 const xmlChar *encoding;
51
52 if (doc == NULL)
53 return(NULL);
54 cur = doc->children;
55
56 /*
57 * Search the html
58 */
59 while (cur != NULL) {
60 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
61 if (xmlStrEqual(cur->name, BAD_CAST"html"))
62 break;
63 if (xmlStrEqual(cur->name, BAD_CAST"head"))
64 goto found_head;
65 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
66 goto found_meta;
67 }
68 cur = cur->next;
69 }
70 if (cur == NULL)
71 return(NULL);
72 cur = cur->children;
73
74 /*
75 * Search the head
76 */
77 while (cur != NULL) {
78 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
79 if (xmlStrEqual(cur->name, BAD_CAST"head"))
80 break;
81 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
82 goto found_meta;
83 }
84 cur = cur->next;
85 }
86 if (cur == NULL)
87 return(NULL);
88 found_head:
89 cur = cur->children;
90
91 /*
92 * Search the meta elements
93 */
94 found_meta:
95 while (cur != NULL) {
96 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
97 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
98 xmlAttrPtr attr = cur->properties;
99 int http;
100 const xmlChar *value;
101
102 content = NULL;
103 http = 0;
104 while (attr != NULL) {
105 if ((attr->children != NULL) &&
106 (attr->children->type == XML_TEXT_NODE) &&
107 (attr->children->next == NULL)) {
108 value = attr->children->content;
109 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
110 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
111 http = 1;
112 else if ((value != NULL)
113 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
114 content = value;
115 if ((http != 0) && (content != NULL))
116 goto found_content;
117 }
118 attr = attr->next;
119 }
120 }
121 }
122 cur = cur->next;
123 }
124 return(NULL);
125
126 found_content:
127 encoding = xmlStrstr(content, BAD_CAST"charset=");
128 if (encoding == NULL)
129 encoding = xmlStrstr(content, BAD_CAST"Charset=");
130 if (encoding == NULL)
131 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
132 if (encoding != NULL) {
133 encoding += 8;
134 } else {
135 encoding = xmlStrstr(content, BAD_CAST"charset =");
136 if (encoding == NULL)
137 encoding = xmlStrstr(content, BAD_CAST"Charset =");
138 if (encoding == NULL)
139 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
140 if (encoding != NULL)
141 encoding += 9;
142 }
143 if (encoding != NULL) {
144 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
145 }
146 return(encoding);
147 }
148
149 /**
150 * htmlSetMetaEncoding:
151 * @doc: the document
152 * @encoding: the encoding string
153 *
154 * Sets the current encoding in the Meta tags
155 * NOTE: this will not change the document content encoding, just
156 * the META flag associated.
157 *
158 * Returns 0 in case of success and -1 in case of error
159 */
160 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)161 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
162 htmlNodePtr cur, meta = NULL, head = NULL;
163 const xmlChar *content = NULL;
164 char newcontent[100];
165
166 newcontent[0] = 0;
167
168 if (doc == NULL)
169 return(-1);
170
171 /* html isn't a real encoding it's just libxml2 way to get entities */
172 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
173 return(-1);
174
175 if (encoding != NULL) {
176 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
177 (char *)encoding);
178 newcontent[sizeof(newcontent) - 1] = 0;
179 }
180
181 cur = doc->children;
182
183 /*
184 * Search the html
185 */
186 while (cur != NULL) {
187 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
188 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
189 break;
190 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
191 goto found_head;
192 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
193 goto found_meta;
194 }
195 cur = cur->next;
196 }
197 if (cur == NULL)
198 return(-1);
199 cur = cur->children;
200
201 /*
202 * Search the head
203 */
204 while (cur != NULL) {
205 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
206 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
207 break;
208 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
209 head = cur->parent;
210 goto found_meta;
211 }
212 }
213 cur = cur->next;
214 }
215 if (cur == NULL)
216 return(-1);
217 found_head:
218 head = cur;
219 if (cur->children == NULL)
220 goto create;
221 cur = cur->children;
222
223 found_meta:
224 /*
225 * Search and update all the remaining the meta elements carrying
226 * encoding information
227 */
228 while (cur != NULL) {
229 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
230 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
231 xmlAttrPtr attr = cur->properties;
232 int http;
233 const xmlChar *value;
234
235 content = NULL;
236 http = 0;
237 while (attr != NULL) {
238 if ((attr->children != NULL) &&
239 (attr->children->type == XML_TEXT_NODE) &&
240 (attr->children->next == NULL)) {
241 value = attr->children->content;
242 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
243 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
244 http = 1;
245 else
246 {
247 if ((value != NULL) &&
248 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
249 content = value;
250 }
251 if ((http != 0) && (content != NULL))
252 break;
253 }
254 attr = attr->next;
255 }
256 if ((http != 0) && (content != NULL)) {
257 meta = cur;
258 break;
259 }
260
261 }
262 }
263 cur = cur->next;
264 }
265 create:
266 if (meta == NULL) {
267 if ((encoding != NULL) && (head != NULL)) {
268 /*
269 * Create a new Meta element with the right attributes
270 */
271
272 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
273 if (head->children == NULL)
274 xmlAddChild(head, meta);
275 else
276 xmlAddPrevSibling(head->children, meta);
277 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
278 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
279 }
280 } else {
281 /* remove the meta tag if NULL is passed */
282 if (encoding == NULL) {
283 xmlUnlinkNode(meta);
284 xmlFreeNode(meta);
285 }
286 /* change the document only if there is a real encoding change */
287 else if (xmlStrcasestr(content, encoding) == NULL) {
288 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
289 }
290 }
291
292
293 return(0);
294 }
295
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
309
310
311 /**
312 * htmlIsBooleanAttr:
313 * @name: the name of the attribute to check
314 *
315 * DEPRECATED: Internal function, don't use.
316 *
317 * Determine if a given attribute is a boolean attribute.
318 *
319 * returns: false if the attribute is not boolean, true otherwise.
320 */
321 int
htmlIsBooleanAttr(const xmlChar * name)322 htmlIsBooleanAttr(const xmlChar *name)
323 {
324 int i = 0;
325
326 while (htmlBooleanAttrs[i] != NULL) {
327 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
328 return 1;
329 i++;
330 }
331 return 0;
332 }
333
334 #ifdef LIBXML_OUTPUT_ENABLED
335 /************************************************************************
336 * *
337 * Output error handlers *
338 * *
339 ************************************************************************/
340
341 /**
342 * htmlSaveErr:
343 * @code: the error number
344 * @node: the location of the error.
345 * @extra: extra information
346 *
347 * Handle an out of memory condition
348 */
349 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)350 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351 {
352 const char *msg = NULL;
353 int res;
354
355 switch(code) {
356 case XML_SAVE_NOT_UTF8:
357 msg = "string is not in UTF-8\n";
358 break;
359 case XML_SAVE_CHAR_INVALID:
360 msg = "invalid character value\n";
361 break;
362 case XML_SAVE_UNKNOWN_ENCODING:
363 msg = "unknown encoding %s\n";
364 break;
365 case XML_SAVE_NO_DOCTYPE:
366 msg = "HTML has no DOCTYPE\n";
367 break;
368 default:
369 msg = "unexpected error number\n";
370 }
371
372 res = xmlRaiseError(NULL, NULL, NULL, NULL, node,
373 XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
374 extra, NULL, NULL, 0, 0,
375 msg, extra);
376 if (res < 0)
377 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
378 }
379
380 /************************************************************************
381 * *
382 * Dumping HTML tree content to a simple buffer *
383 * *
384 ************************************************************************/
385
386 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)387 htmlFindOutputEncoder(const char *encoding) {
388 xmlCharEncodingHandler *handler = NULL;
389
390 if (encoding != NULL) {
391 int res;
392
393 res = xmlOpenCharEncodingHandler(encoding, /* output */ 1,
394 &handler);
395 if (res != XML_ERR_OK)
396 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
397 } else {
398 /*
399 * Fallback to HTML when the encoding is unspecified
400 */
401 xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402 }
403
404 return(handler);
405 }
406
407 /**
408 * htmlBufNodeDumpFormat:
409 * @buf: the xmlBufPtr output
410 * @doc: the document
411 * @cur: the current node
412 * @format: should formatting spaces been added
413 *
414 * Dump an HTML node, recursive behaviour,children are printed too.
415 *
416 * Returns the number of byte written or -1 in case of error
417 */
418 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)419 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
420 int format) {
421 size_t use;
422 size_t ret;
423 xmlOutputBufferPtr outbuf;
424
425 if (cur == NULL) {
426 return ((size_t) -1);
427 }
428 if (buf == NULL) {
429 return ((size_t) -1);
430 }
431 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
432 if (outbuf == NULL)
433 return ((size_t) -1);
434 memset(outbuf, 0, sizeof(xmlOutputBuffer));
435 outbuf->buffer = buf;
436 outbuf->encoder = NULL;
437 outbuf->writecallback = NULL;
438 outbuf->closecallback = NULL;
439 outbuf->context = NULL;
440 outbuf->written = 0;
441
442 use = xmlBufUse(buf);
443 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
444 if (outbuf->error)
445 ret = (size_t) -1;
446 else
447 ret = xmlBufUse(buf) - use;
448 xmlFree(outbuf);
449 return (ret);
450 }
451
452 /**
453 * htmlNodeDump:
454 * @buf: the HTML buffer output
455 * @doc: the document
456 * @cur: the current node
457 *
458 * Dump an HTML node, recursive behaviour,children are printed too,
459 * and formatting returns are added.
460 *
461 * Returns the number of byte written or -1 in case of error
462 */
463 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)464 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
465 xmlBufPtr buffer;
466 size_t ret1;
467 int ret2;
468
469 if ((buf == NULL) || (cur == NULL))
470 return(-1);
471
472 xmlInitParser();
473 buffer = xmlBufFromBuffer(buf);
474 if (buffer == NULL)
475 return(-1);
476
477 ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
478
479 ret2 = xmlBufBackToBuffer(buffer, buf);
480
481 if ((ret1 == (size_t) -1) || (ret2 < 0))
482 return(-1);
483 return(ret1 > INT_MAX ? INT_MAX : ret1);
484 }
485
486 /**
487 * htmlNodeDumpFileFormat:
488 * @out: the FILE pointer
489 * @doc: the document
490 * @cur: the current node
491 * @encoding: the document encoding
492 * @format: should formatting spaces been added
493 *
494 * Dump an HTML node, recursive behaviour,children are printed too.
495 *
496 * TODO: if encoding == NULL try to save in the doc encoding
497 *
498 * returns: the number of byte written or -1 in case of failure.
499 */
500 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)501 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
502 xmlNodePtr cur, const char *encoding, int format) {
503 xmlOutputBufferPtr buf;
504 xmlCharEncodingHandlerPtr handler;
505 int ret;
506
507 xmlInitParser();
508
509 /*
510 * save the content to a temp buffer.
511 */
512 handler = htmlFindOutputEncoder(encoding);
513 buf = xmlOutputBufferCreateFile(out, handler);
514 if (buf == NULL)
515 return(0);
516
517 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
518
519 ret = xmlOutputBufferClose(buf);
520 return(ret);
521 }
522
523 /**
524 * htmlNodeDumpFile:
525 * @out: the FILE pointer
526 * @doc: the document
527 * @cur: the current node
528 *
529 * Dump an HTML node, recursive behaviour,children are printed too,
530 * and formatting returns are added.
531 */
532 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)533 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
534 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
535 }
536
537 /**
538 * htmlDocDumpMemoryFormat:
539 * @cur: the document
540 * @mem: OUT: the memory pointer
541 * @size: OUT: the memory length
542 * @format: should formatting spaces been added
543 *
544 * Dump an HTML document in memory and return the xmlChar * and it's size.
545 * It's up to the caller to free the memory.
546 */
547 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)548 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
549 xmlOutputBufferPtr buf;
550 xmlCharEncodingHandlerPtr handler = NULL;
551 const char *encoding;
552
553 xmlInitParser();
554
555 if ((mem == NULL) || (size == NULL))
556 return;
557 *mem = NULL;
558 *size = 0;
559 if (cur == NULL)
560 return;
561
562 encoding = (const char *) htmlGetMetaEncoding(cur);
563 handler = htmlFindOutputEncoder(encoding);
564 buf = xmlAllocOutputBuffer(handler);
565 if (buf == NULL)
566 return;
567
568 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
569
570 xmlOutputBufferFlush(buf);
571
572 if (!buf->error) {
573 if (buf->conv != NULL) {
574 *size = xmlBufUse(buf->conv);
575 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
576 } else {
577 *size = xmlBufUse(buf->buffer);
578 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
579 }
580 }
581
582 xmlOutputBufferClose(buf);
583 }
584
585 /**
586 * htmlDocDumpMemory:
587 * @cur: the document
588 * @mem: OUT: the memory pointer
589 * @size: OUT: the memory length
590 *
591 * Dump an HTML document in memory and return the xmlChar * and it's size.
592 * It's up to the caller to free the memory.
593 */
594 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)595 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
596 htmlDocDumpMemoryFormat(cur, mem, size, 1);
597 }
598
599
600 /************************************************************************
601 * *
602 * Dumping HTML tree content to an I/O output buffer *
603 * *
604 ************************************************************************/
605
606 /**
607 * htmlDtdDumpOutput:
608 * @buf: the HTML buffer output
609 * @doc: the document
610 * @encoding: the encoding string
611 *
612 * TODO: check whether encoding is needed
613 *
614 * Dump the HTML document DTD, if any.
615 */
616 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)617 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
618 const char *encoding ATTRIBUTE_UNUSED) {
619 xmlDtdPtr cur = doc->intSubset;
620
621 if (cur == NULL) {
622 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
623 return;
624 }
625 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
626 xmlOutputBufferWriteString(buf, (const char *)cur->name);
627 if (cur->ExternalID != NULL) {
628 xmlOutputBufferWriteString(buf, " PUBLIC ");
629 xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
630 if (cur->SystemID != NULL) {
631 xmlOutputBufferWriteString(buf, " ");
632 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
633 }
634 } else if (cur->SystemID != NULL &&
635 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
636 xmlOutputBufferWriteString(buf, " SYSTEM ");
637 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
638 }
639 xmlOutputBufferWriteString(buf, ">\n");
640 }
641
642 /**
643 * htmlAttrDumpOutput:
644 * @buf: the HTML buffer output
645 * @doc: the document
646 * @cur: the attribute pointer
647 *
648 * Dump an HTML attribute
649 */
650 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)651 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
652 xmlChar *value;
653
654 /*
655 * The html output method should not escape a & character
656 * occurring in an attribute value immediately followed by
657 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
658 * This is implemented in xmlEncodeEntitiesReentrant
659 */
660
661 if (cur == NULL) {
662 return;
663 }
664 xmlOutputBufferWriteString(buf, " ");
665 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
666 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
667 xmlOutputBufferWriteString(buf, ":");
668 }
669 xmlOutputBufferWriteString(buf, (const char *)cur->name);
670 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
671 value = xmlNodeListGetString(doc, cur->children, 0);
672 if (value) {
673 xmlOutputBufferWriteString(buf, "=");
674 if ((cur->ns == NULL) && (cur->parent != NULL) &&
675 (cur->parent->ns == NULL) &&
676 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
677 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
678 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
679 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
680 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
681 xmlChar *escaped;
682 xmlChar *tmp = value;
683
684 while (IS_BLANK_CH(*tmp)) tmp++;
685
686 /*
687 * Angle brackets are technically illegal in URIs, but they're
688 * used in server side includes, for example. Curly brackets
689 * are illegal as well and often used in templates.
690 * Don't escape non-whitespace, printable ASCII chars for
691 * improved interoperability. Only escape space, control
692 * and non-ASCII chars.
693 */
694 escaped = xmlURIEscapeStr(tmp,
695 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
696 if (escaped != NULL) {
697 xmlOutputBufferWriteQuotedString(buf, escaped);
698 xmlFree(escaped);
699 } else {
700 buf->error = XML_ERR_NO_MEMORY;
701 }
702 } else {
703 xmlOutputBufferWriteQuotedString(buf, value);
704 }
705 xmlFree(value);
706 } else {
707 buf->error = XML_ERR_NO_MEMORY;
708 }
709 }
710 }
711
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string (unused)
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node, recursive behaviour,children are printed too.
 *
 * The subtree rooted at @cur is walked iteratively: the outer loop
 * emits the current node and descends into its children, the inner
 * loop moves to the next sibling or climbs back to the parent, writing
 * end tags on the way up. A recursive call is only made for elements
 * whose parent pointer does not match the tracked parent. Nothing is
 * written if @cur or @buf is NULL. If escaping a text node fails,
 * buf->error is set to XML_ERR_NO_MEMORY and the dump stops.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                         xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format) {
    xmlNodePtr root, parent;
    xmlAttrPtr attr;
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
        return;
    }

    /*
     * root marks where the walk must stop; parent tracks the node the
     * walk descended from, independently of cur->parent.
     */
    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            /* A document starts with its DOCTYPE, if it has one. */
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWriteString(buf, "\n");
            }
            break;

        case XML_ELEMENT_NODE:
            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             * Namespaced elements are never looked up.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            /* Start tag: optional prefix, name, namespaces, attributes. */
            xmlOutputBufferWriteString(buf, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                htmlAttrDumpOutput(buf, doc, attr);
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /* Element flagged empty: no end tag, children not visited. */
                xmlOutputBufferWriteString(buf, ">");
            } else if (cur->children == NULL) {
                /*
                 * Childless element: the end tag is left out when the
                 * descriptor has saveEndTag set, except for html and body.
                 */
                if ((info != NULL) && (info->saveEndTag != 0) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
                    xmlOutputBufferWriteString(buf, ">");
                } else {
                    xmlOutputBufferWriteString(buf, "></");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWriteString(buf, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWriteString(buf, ">");
                }
            } else {
                /*
                 * Element with children: close the start tag, optionally
                 * add a newline, then descend. The end tag is written by
                 * the climbing loop below.
                 */
                xmlOutputBufferWriteString(buf, ">");
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->children->type != HTML_TEXT_NODE) &&
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
                parent = cur;
                cur = cur->children;
                continue;
            }

            /*
             * Newline after a non-inline element that is followed by a
             * sibling other than text or an entity reference.
             */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
            }

            break;

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /*
             * Text is escaped unless the node is named
             * xmlStringTextNoenc or sits inside script or style.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((parent == NULL) ||
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer == NULL) {
                    buf->error = XML_ERR_NO_MEMORY;
                    return;
                }
                xmlOutputBufferWriteString(buf, (const char *)buffer);
                xmlFree(buffer);
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWriteString(buf, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* Processing instructions are closed with ">" here, not "?>". */
            if (cur->name != NULL) {
                xmlOutputBufferWriteString(buf, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWriteString(buf, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWriteString(buf, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ";");
            break;

        case HTML_PRESERVE_NODE:
            /* Content is written verbatim, without escaping. */
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            break;
        }

        /*
         * The current node is done: move to the next sibling, or climb
         * to the parent and close it, until a sibling is found or the
         * root is reached.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWriteString(buf, "\n");
            } else {
                /* The descriptor is only needed for formatting decisions. */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                /* Newline before the end tag, mirroring the start tag rule. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");

                xmlOutputBufferWriteString(buf, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWriteString(buf, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWriteString(buf, ">");

                /* Newline after the end tag, same rule as for leaf elements. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWriteString(buf, "\n");
                }
            }
        }
    }
}
946
947 /**
948 * htmlNodeDumpOutput:
949 * @buf: the HTML buffer output
950 * @doc: the document
951 * @cur: the current node
952 * @encoding: the encoding string (unused)
953 *
954 * Dump an HTML node, recursive behaviour,children are printed too,
955 * and formatting returns/spaces are added.
956 */
957 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)958 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
960 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
961 }
962
963 /**
964 * htmlDocContentDumpFormatOutput:
965 * @buf: the HTML buffer output
966 * @cur: the document
967 * @encoding: the encoding string (unused)
968 * @format: should formatting spaces been added
969 *
970 * Dump an HTML document.
971 */
972 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)973 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974 const char *encoding ATTRIBUTE_UNUSED,
975 int format) {
976 int type = 0;
977 if (cur) {
978 type = cur->type;
979 cur->type = XML_HTML_DOCUMENT_NODE;
980 }
981 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
982 if (cur)
983 cur->type = (xmlElementType) type;
984 }
985
986 /**
987 * htmlDocContentDumpOutput:
988 * @buf: the HTML buffer output
989 * @cur: the document
990 * @encoding: the encoding string (unused)
991 *
992 * Dump an HTML document. Formatting return/spaces are added.
993 */
994 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)995 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
996 const char *encoding ATTRIBUTE_UNUSED) {
997 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
998 }
999
1000 /************************************************************************
1001 * *
1002 * Saving functions front-ends *
1003 * *
1004 ************************************************************************/
1005
1006 /**
1007 * htmlDocDump:
1008 * @f: the FILE*
1009 * @cur: the document
1010 *
1011 * Dump an HTML document to an open FILE.
1012 *
1013 * returns: the number of byte written or -1 in case of failure.
1014 */
1015 int
htmlDocDump(FILE * f,xmlDocPtr cur)1016 htmlDocDump(FILE *f, xmlDocPtr cur) {
1017 xmlOutputBufferPtr buf;
1018 xmlCharEncodingHandlerPtr handler = NULL;
1019 const char *encoding;
1020 int ret;
1021
1022 xmlInitParser();
1023
1024 if ((cur == NULL) || (f == NULL)) {
1025 return(-1);
1026 }
1027
1028 encoding = (const char *) htmlGetMetaEncoding(cur);
1029 handler = htmlFindOutputEncoder(encoding);
1030 buf = xmlOutputBufferCreateFile(f, handler);
1031 if (buf == NULL)
1032 return(-1);
1033 htmlDocContentDumpOutput(buf, cur, NULL);
1034
1035 ret = xmlOutputBufferClose(buf);
1036 return(ret);
1037 }
1038
1039 /**
1040 * htmlSaveFile:
1041 * @filename: the filename (or URL)
1042 * @cur: the document
1043 *
1044 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1045 * used.
1046 * returns: the number of byte written or -1 in case of failure.
1047 */
1048 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1049 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1050 xmlOutputBufferPtr buf;
1051 xmlCharEncodingHandlerPtr handler = NULL;
1052 const char *encoding;
1053 int ret;
1054
1055 if ((cur == NULL) || (filename == NULL))
1056 return(-1);
1057
1058 xmlInitParser();
1059
1060 encoding = (const char *) htmlGetMetaEncoding(cur);
1061 handler = htmlFindOutputEncoder(encoding);
1062 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1063 if (buf == NULL)
1064 return(0);
1065
1066 htmlDocContentDumpOutput(buf, cur, NULL);
1067
1068 ret = xmlOutputBufferClose(buf);
1069 return(ret);
1070 }
1071
1072 /**
1073 * htmlSaveFileFormat:
1074 * @filename: the filename
1075 * @cur: the document
1076 * @format: should formatting spaces been added
1077 * @encoding: the document encoding
1078 *
1079 * Dump an HTML document to a file using a given encoding.
1080 *
1081 * returns: the number of byte written or -1 in case of failure.
1082 */
1083 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1084 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1085 const char *encoding, int format) {
1086 xmlOutputBufferPtr buf;
1087 xmlCharEncodingHandlerPtr handler = NULL;
1088 int ret;
1089
1090 if ((cur == NULL) || (filename == NULL))
1091 return(-1);
1092
1093 xmlInitParser();
1094
1095 handler = htmlFindOutputEncoder(encoding);
1096 if (handler != NULL)
1097 htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1098 else
1099 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1100
1101 /*
1102 * save the content to a temp buffer.
1103 */
1104 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1105 if (buf == NULL)
1106 return(0);
1107
1108 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1109
1110 ret = xmlOutputBufferClose(buf);
1111 return(ret);
1112 }
1113
1114 /**
1115 * htmlSaveFileEnc:
1116 * @filename: the filename
1117 * @cur: the document
1118 * @encoding: the document encoding
1119 *
1120 * Dump an HTML document to a file using a given encoding
1121 * and formatting returns/spaces are added.
1122 *
1123 * returns: the number of byte written or -1 in case of failure.
1124 */
1125 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1126 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1127 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1128 }
1129
1130 #endif /* LIBXML_OUTPUT_ENABLED */
1131
1132 #endif /* LIBXML_HTML_ENABLED */
1133