1 /*
 * HTMLtree.c : implementation of access functions for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32
33 /************************************************************************
34 * *
35 * Getting/Setting encoding meta tags *
36 * *
37 ************************************************************************/
38
39 /**
40 * htmlGetMetaEncoding:
41 * @doc: the document
42 *
43 * Encoding definition lookup in the Meta tags
44 *
45 * Returns the current encoding as flagged in the HTML source
46 */
47 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)48 htmlGetMetaEncoding(htmlDocPtr doc) {
49 htmlNodePtr cur;
50 const xmlChar *content;
51 const xmlChar *encoding;
52
53 if (doc == NULL)
54 return(NULL);
55 cur = doc->children;
56
57 /*
58 * Search the html
59 */
60 while (cur != NULL) {
61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62 if (xmlStrEqual(cur->name, BAD_CAST"html"))
63 break;
64 if (xmlStrEqual(cur->name, BAD_CAST"head"))
65 goto found_head;
66 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67 goto found_meta;
68 }
69 cur = cur->next;
70 }
71 if (cur == NULL)
72 return(NULL);
73 cur = cur->children;
74
75 /*
76 * Search the head
77 */
78 while (cur != NULL) {
79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80 if (xmlStrEqual(cur->name, BAD_CAST"head"))
81 break;
82 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83 goto found_meta;
84 }
85 cur = cur->next;
86 }
87 if (cur == NULL)
88 return(NULL);
89 found_head:
90 cur = cur->children;
91
92 /*
93 * Search the meta elements
94 */
95 found_meta:
96 while (cur != NULL) {
97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99 xmlAttrPtr attr = cur->properties;
100 int http;
101 const xmlChar *value;
102
103 content = NULL;
104 http = 0;
105 while (attr != NULL) {
106 if ((attr->children != NULL) &&
107 (attr->children->type == XML_TEXT_NODE) &&
108 (attr->children->next == NULL)) {
109 value = attr->children->content;
110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112 http = 1;
113 else if ((value != NULL)
114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115 content = value;
116 if ((http != 0) && (content != NULL))
117 goto found_content;
118 }
119 attr = attr->next;
120 }
121 }
122 }
123 cur = cur->next;
124 }
125 return(NULL);
126
127 found_content:
128 encoding = xmlStrstr(content, BAD_CAST"charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"Charset=");
131 if (encoding == NULL)
132 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133 if (encoding != NULL) {
134 encoding += 8;
135 } else {
136 encoding = xmlStrstr(content, BAD_CAST"charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"Charset =");
139 if (encoding == NULL)
140 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141 if (encoding != NULL)
142 encoding += 9;
143 }
144 if (encoding != NULL) {
145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146 }
147 return(encoding);
148 }
149
150 /**
151 * htmlSetMetaEncoding:
152 * @doc: the document
153 * @encoding: the encoding string
154 *
155 * Sets the current encoding in the Meta tags
156 * NOTE: this will not change the document content encoding, just
157 * the META flag associated.
158 *
159 * Returns 0 in case of success and -1 in case of error
160 */
161 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163 htmlNodePtr cur, meta;
164 const xmlChar *content;
165 char newcontent[100];
166
167
168 if (doc == NULL)
169 return(-1);
170
171 if (encoding != NULL) {
172 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173 (char *)encoding);
174 newcontent[sizeof(newcontent) - 1] = 0;
175 }
176
177 cur = doc->children;
178
179 /*
180 * Search the html
181 */
182 while (cur != NULL) {
183 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185 break;
186 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187 goto found_head;
188 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189 goto found_meta;
190 }
191 cur = cur->next;
192 }
193 if (cur == NULL)
194 return(-1);
195 cur = cur->children;
196
197 /*
198 * Search the head
199 */
200 while (cur != NULL) {
201 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203 break;
204 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205 goto found_meta;
206 }
207 cur = cur->next;
208 }
209 if (cur == NULL)
210 return(-1);
211 found_head:
212 if (cur->children == NULL) {
213 if (encoding == NULL)
214 return(0);
215 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216 xmlAddChild(cur, meta);
217 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219 return(0);
220 }
221 cur = cur->children;
222
223 found_meta:
224 if (encoding != NULL) {
225 /*
226 * Create a new Meta element with the right attributes
227 */
228
229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230 xmlAddPrevSibling(cur, meta);
231 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233 }
234
235 /*
236 * Search and destroy all the remaining the meta elements carrying
237 * encoding informations
238 */
239 while (cur != NULL) {
240 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242 xmlAttrPtr attr = cur->properties;
243 int http;
244 const xmlChar *value;
245
246 content = NULL;
247 http = 0;
248 while (attr != NULL) {
249 if ((attr->children != NULL) &&
250 (attr->children->type == XML_TEXT_NODE) &&
251 (attr->children->next == NULL)) {
252 value = attr->children->content;
253 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255 http = 1;
256 else
257 {
258 if ((value != NULL) &&
259 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260 content = value;
261 }
262 if ((http != 0) && (content != NULL))
263 break;
264 }
265 attr = attr->next;
266 }
267 if ((http != 0) && (content != NULL)) {
268 meta = cur;
269 cur = cur->next;
270 xmlUnlinkNode(meta);
271 xmlFreeNode(meta);
272 continue;
273 }
274
275 }
276 }
277 cur = cur->next;
278 }
279 return(0);
280 }
281
/**
 * htmlBooleanAttrs:
 *
 * NULL terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295
296
297 /**
298 * htmlIsBooleanAttr:
299 * @name: the name of the attribute to check
300 *
301 * Determine if a given attribute is a boolean attribute.
302 *
303 * returns: false if the attribute is not boolean, true otherwise.
304 */
305 int
htmlIsBooleanAttr(const xmlChar * name)306 htmlIsBooleanAttr(const xmlChar *name)
307 {
308 int i = 0;
309
310 while (htmlBooleanAttrs[i] != NULL) {
311 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312 return 1;
313 i++;
314 }
315 return 0;
316 }
317
318 #ifdef LIBXML_OUTPUT_ENABLED
/*
 * private routine exported from xmlIO.c
 *
 * Prototyped here by hand. htmlDocDumpMemoryFormat() below calls it with
 * the selected encoding handler to obtain the output buffer the document
 * is serialized into.
 */
xmlOutputBufferPtr
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
324 /************************************************************************
325 * *
326 * Output error handlers *
327 * *
328 ************************************************************************/
329 /**
330 * htmlSaveErrMemory:
331 * @extra: extra informations
332 *
333 * Handle an out of memory condition
334 */
335 static void
htmlSaveErrMemory(const char * extra)336 htmlSaveErrMemory(const char *extra)
337 {
338 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
339 }
340
341 /**
342 * htmlSaveErr:
343 * @code: the error number
344 * @node: the location of the error.
345 * @extra: extra informations
346 *
347 * Handle an out of memory condition
348 */
349 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)350 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351 {
352 const char *msg = NULL;
353
354 switch(code) {
355 case XML_SAVE_NOT_UTF8:
356 msg = "string is not in UTF-8\n";
357 break;
358 case XML_SAVE_CHAR_INVALID:
359 msg = "invalid character value\n";
360 break;
361 case XML_SAVE_UNKNOWN_ENCODING:
362 msg = "unknown encoding %s\n";
363 break;
364 case XML_SAVE_NO_DOCTYPE:
365 msg = "HTML has no DOCTYPE\n";
366 break;
367 default:
368 msg = "unexpected error number\n";
369 }
370 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
371 }
372
373 /************************************************************************
374 * *
375 * Dumping HTML tree content to a simple buffer *
376 * *
377 ************************************************************************/
378
379 static int
380 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
381 int format);
382
383 /**
384 * htmlNodeDumpFormat:
385 * @buf: the HTML buffer output
386 * @doc: the document
387 * @cur: the current node
388 * @format: should formatting spaces been added
389 *
390 * Dump an HTML node, recursive behaviour,children are printed too.
391 *
392 * Returns the number of byte written or -1 in case of error
393 */
394 static int
htmlNodeDumpFormat(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)395 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
396 int format) {
397 unsigned int use;
398 int ret;
399 xmlOutputBufferPtr outbuf;
400
401 if (cur == NULL) {
402 return (-1);
403 }
404 if (buf == NULL) {
405 return (-1);
406 }
407 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
408 if (outbuf == NULL) {
409 htmlSaveErrMemory("allocating HTML output buffer");
410 return (-1);
411 }
412 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
413 outbuf->buffer = buf;
414 outbuf->encoder = NULL;
415 outbuf->writecallback = NULL;
416 outbuf->closecallback = NULL;
417 outbuf->context = NULL;
418 outbuf->written = 0;
419
420 use = buf->use;
421 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
422 xmlFree(outbuf);
423 ret = buf->use - use;
424 return (ret);
425 }
426
427 /**
428 * htmlNodeDump:
429 * @buf: the HTML buffer output
430 * @doc: the document
431 * @cur: the current node
432 *
433 * Dump an HTML node, recursive behaviour,children are printed too,
434 * and formatting returns are added.
435 *
436 * Returns the number of byte written or -1 in case of error
437 */
438 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)439 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440 xmlInitParser();
441
442 return(htmlNodeDumpFormat(buf, doc, cur, 1));
443 }
444
445 /**
446 * htmlNodeDumpFileFormat:
447 * @out: the FILE pointer
448 * @doc: the document
449 * @cur: the current node
450 * @encoding: the document encoding
451 * @format: should formatting spaces been added
452 *
453 * Dump an HTML node, recursive behaviour,children are printed too.
454 *
455 * TODO: if encoding == NULL try to save in the doc encoding
456 *
457 * returns: the number of byte written or -1 in case of failure.
458 */
459 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)460 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
461 xmlNodePtr cur, const char *encoding, int format) {
462 xmlOutputBufferPtr buf;
463 xmlCharEncodingHandlerPtr handler = NULL;
464 int ret;
465
466 xmlInitParser();
467
468 if (encoding != NULL) {
469 xmlCharEncoding enc;
470
471 enc = xmlParseCharEncoding(encoding);
472 if (enc != XML_CHAR_ENCODING_UTF8) {
473 handler = xmlFindCharEncodingHandler(encoding);
474 if (handler == NULL)
475 return(-1);
476 }
477 }
478
479 /*
480 * Fallback to HTML or ASCII when the encoding is unspecified
481 */
482 if (handler == NULL)
483 handler = xmlFindCharEncodingHandler("HTML");
484 if (handler == NULL)
485 handler = xmlFindCharEncodingHandler("ascii");
486
487 /*
488 * save the content to a temp buffer.
489 */
490 buf = xmlOutputBufferCreateFile(out, handler);
491 if (buf == NULL) return(0);
492
493 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
494
495 ret = xmlOutputBufferClose(buf);
496 return(ret);
497 }
498
499 /**
500 * htmlNodeDumpFile:
501 * @out: the FILE pointer
502 * @doc: the document
503 * @cur: the current node
504 *
505 * Dump an HTML node, recursive behaviour,children are printed too,
506 * and formatting returns are added.
507 */
508 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)509 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
510 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
511 }
512
513 /**
514 * htmlDocDumpMemoryFormat:
515 * @cur: the document
516 * @mem: OUT: the memory pointer
517 * @size: OUT: the memory length
518 * @format: should formatting spaces been added
519 *
520 * Dump an HTML document in memory and return the xmlChar * and it's size.
521 * It's up to the caller to free the memory.
522 */
523 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)524 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
525 xmlOutputBufferPtr buf;
526 xmlCharEncodingHandlerPtr handler = NULL;
527 const char *encoding;
528
529 xmlInitParser();
530
531 if ((mem == NULL) || (size == NULL))
532 return;
533 if (cur == NULL) {
534 *mem = NULL;
535 *size = 0;
536 return;
537 }
538
539 encoding = (const char *) htmlGetMetaEncoding(cur);
540
541 if (encoding != NULL) {
542 xmlCharEncoding enc;
543
544 enc = xmlParseCharEncoding(encoding);
545 if (enc != cur->charset) {
546 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
547 /*
548 * Not supported yet
549 */
550 *mem = NULL;
551 *size = 0;
552 return;
553 }
554
555 handler = xmlFindCharEncodingHandler(encoding);
556 if (handler == NULL) {
557 *mem = NULL;
558 *size = 0;
559 return;
560 }
561 } else {
562 handler = xmlFindCharEncodingHandler(encoding);
563 }
564 }
565
566 /*
567 * Fallback to HTML or ASCII when the encoding is unspecified
568 */
569 if (handler == NULL)
570 handler = xmlFindCharEncodingHandler("HTML");
571 if (handler == NULL)
572 handler = xmlFindCharEncodingHandler("ascii");
573
574 buf = xmlAllocOutputBufferInternal(handler);
575 if (buf == NULL) {
576 *mem = NULL;
577 *size = 0;
578 return;
579 }
580
581 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
582
583 xmlOutputBufferFlush(buf);
584 if (buf->conv != NULL) {
585 *size = buf->conv->use;
586 *mem = xmlStrndup(buf->conv->content, *size);
587 } else {
588 *size = buf->buffer->use;
589 *mem = xmlStrndup(buf->buffer->content, *size);
590 }
591 (void)xmlOutputBufferClose(buf);
592 }
593
594 /**
595 * htmlDocDumpMemory:
596 * @cur: the document
597 * @mem: OUT: the memory pointer
598 * @size: OUT: the memory length
599 *
600 * Dump an HTML document in memory and return the xmlChar * and it's size.
601 * It's up to the caller to free the memory.
602 */
603 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)604 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
605 htmlDocDumpMemoryFormat(cur, mem, size, 1);
606 }
607
608
609 /************************************************************************
610 * *
611 * Dumping HTML tree content to an I/O output buffer *
612 * *
613 ************************************************************************/
614
/*
 * Prototyped here by hand; no definition appears in this file, so it is
 * presumably provided by another libxml2 module (to be confirmed).
 * htmlNodeDumpFormatOutput() calls it with the nsDef list of an element.
 */
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
616
617 /**
618 * htmlDtdDumpOutput:
619 * @buf: the HTML buffer output
620 * @doc: the document
621 * @encoding: the encoding string
622 *
623 * TODO: check whether encoding is needed
624 *
625 * Dump the HTML document DTD, if any.
626 */
627 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)628 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
629 const char *encoding ATTRIBUTE_UNUSED) {
630 xmlDtdPtr cur = doc->intSubset;
631
632 if (cur == NULL) {
633 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
634 return;
635 }
636 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
637 xmlOutputBufferWriteString(buf, (const char *)cur->name);
638 if (cur->ExternalID != NULL) {
639 xmlOutputBufferWriteString(buf, " PUBLIC ");
640 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
641 if (cur->SystemID != NULL) {
642 xmlOutputBufferWriteString(buf, " ");
643 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
644 }
645 } else if (cur->SystemID != NULL) {
646 xmlOutputBufferWriteString(buf, " SYSTEM ");
647 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
648 }
649 xmlOutputBufferWriteString(buf, ">\n");
650 }
651
652 /**
653 * htmlAttrDumpOutput:
654 * @buf: the HTML buffer output
655 * @doc: the document
656 * @cur: the attribute pointer
657 * @encoding: the encoding string
658 *
659 * Dump an HTML attribute
660 */
661 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)662 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
663 const char *encoding ATTRIBUTE_UNUSED) {
664 xmlChar *value;
665
666 /*
667 * TODO: The html output method should not escape a & character
668 * occurring in an attribute value immediately followed by
669 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
670 */
671
672 if (cur == NULL) {
673 return;
674 }
675 xmlOutputBufferWriteString(buf, " ");
676 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
677 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
678 xmlOutputBufferWriteString(buf, ":");
679 }
680 xmlOutputBufferWriteString(buf, (const char *)cur->name);
681 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
682 value = xmlNodeListGetString(doc, cur->children, 0);
683 if (value) {
684 xmlOutputBufferWriteString(buf, "=");
685 if ((cur->ns == NULL) && (cur->parent != NULL) &&
686 (cur->parent->ns == NULL) &&
687 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
688 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
689 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
690 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
691 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
692 xmlChar *escaped;
693 xmlChar *tmp = value;
694
695 while (IS_BLANK_CH(*tmp)) tmp++;
696
697 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
698 if (escaped != NULL) {
699 xmlBufferWriteQuotedString(buf->buffer, escaped);
700 xmlFree(escaped);
701 } else {
702 xmlBufferWriteQuotedString(buf->buffer, value);
703 }
704 } else {
705 xmlBufferWriteQuotedString(buf->buffer, value);
706 }
707 xmlFree(value);
708 } else {
709 xmlOutputBufferWriteString(buf, "=\"\"");
710 }
711 }
712 }
713
714 /**
715 * htmlAttrListDumpOutput:
716 * @buf: the HTML buffer output
717 * @doc: the document
718 * @cur: the first attribute pointer
719 * @encoding: the encoding string
720 *
721 * Dump a list of HTML attributes
722 */
723 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)724 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
725 if (cur == NULL) {
726 return;
727 }
728 while (cur != NULL) {
729 htmlAttrDumpOutput(buf, doc, cur, encoding);
730 cur = cur->next;
731 }
732 }
733
734
735
736 /**
737 * htmlNodeListDumpOutput:
738 * @buf: the HTML buffer output
739 * @doc: the document
740 * @cur: the first node
741 * @encoding: the encoding string
742 * @format: should formatting spaces been added
743 *
744 * Dump an HTML node list, recursive behaviour,children are printed too.
745 */
746 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)747 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
748 xmlNodePtr cur, const char *encoding, int format) {
749 if (cur == NULL) {
750 return;
751 }
752 while (cur != NULL) {
753 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
754 cur = cur->next;
755 }
756 }
757
758 /**
759 * htmlNodeDumpFormatOutput:
760 * @buf: the HTML buffer output
761 * @doc: the document
762 * @cur: the current node
763 * @encoding: the encoding string
764 * @format: should formatting spaces been added
765 *
766 * Dump an HTML node, recursive behaviour,children are printed too.
767 */
768 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)769 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
770 xmlNodePtr cur, const char *encoding, int format) {
771 const htmlElemDesc * info;
772
773 xmlInitParser();
774
775 if ((cur == NULL) || (buf == NULL)) {
776 return;
777 }
778 /*
779 * Special cases.
780 */
781 if (cur->type == XML_DTD_NODE)
782 return;
783 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
784 (cur->type == XML_DOCUMENT_NODE)){
785 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
786 return;
787 }
788 if (cur->type == XML_ATTRIBUTE_NODE) {
789 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
790 return;
791 }
792 if (cur->type == HTML_TEXT_NODE) {
793 if (cur->content != NULL) {
794 if (((cur->name == (const xmlChar *)xmlStringText) ||
795 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
796 ((cur->parent == NULL) ||
797 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
798 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
799 xmlChar *buffer;
800
801 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
802 if (buffer != NULL) {
803 xmlOutputBufferWriteString(buf, (const char *)buffer);
804 xmlFree(buffer);
805 }
806 } else {
807 xmlOutputBufferWriteString(buf, (const char *)cur->content);
808 }
809 }
810 return;
811 }
812 if (cur->type == HTML_COMMENT_NODE) {
813 if (cur->content != NULL) {
814 xmlOutputBufferWriteString(buf, "<!--");
815 xmlOutputBufferWriteString(buf, (const char *)cur->content);
816 xmlOutputBufferWriteString(buf, "-->");
817 }
818 return;
819 }
820 if (cur->type == HTML_PI_NODE) {
821 if (cur->name == NULL)
822 return;
823 xmlOutputBufferWriteString(buf, "<?");
824 xmlOutputBufferWriteString(buf, (const char *)cur->name);
825 if (cur->content != NULL) {
826 xmlOutputBufferWriteString(buf, " ");
827 xmlOutputBufferWriteString(buf, (const char *)cur->content);
828 }
829 xmlOutputBufferWriteString(buf, ">");
830 return;
831 }
832 if (cur->type == HTML_ENTITY_REF_NODE) {
833 xmlOutputBufferWriteString(buf, "&");
834 xmlOutputBufferWriteString(buf, (const char *)cur->name);
835 xmlOutputBufferWriteString(buf, ";");
836 return;
837 }
838 if (cur->type == HTML_PRESERVE_NODE) {
839 if (cur->content != NULL) {
840 xmlOutputBufferWriteString(buf, (const char *)cur->content);
841 }
842 return;
843 }
844
845 /*
846 * Get specific HTML info for that node.
847 */
848 if (cur->ns == NULL)
849 info = htmlTagLookup(cur->name);
850 else
851 info = NULL;
852
853 xmlOutputBufferWriteString(buf, "<");
854 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
855 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
856 xmlOutputBufferWriteString(buf, ":");
857 }
858 xmlOutputBufferWriteString(buf, (const char *)cur->name);
859 if (cur->nsDef)
860 xmlNsListDumpOutput(buf, cur->nsDef);
861 if (cur->properties != NULL)
862 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
863
864 if ((info != NULL) && (info->empty)) {
865 xmlOutputBufferWriteString(buf, ">");
866 if ((format) && (!info->isinline) && (cur->next != NULL)) {
867 if ((cur->next->type != HTML_TEXT_NODE) &&
868 (cur->next->type != HTML_ENTITY_REF_NODE) &&
869 (cur->parent != NULL) &&
870 (cur->parent->name != NULL) &&
871 (cur->parent->name[0] != 'p')) /* p, pre, param */
872 xmlOutputBufferWriteString(buf, "\n");
873 }
874 return;
875 }
876 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
877 (cur->children == NULL)) {
878 if ((info != NULL) && (info->saveEndTag != 0) &&
879 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
880 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
881 xmlOutputBufferWriteString(buf, ">");
882 } else {
883 xmlOutputBufferWriteString(buf, "></");
884 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
885 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
886 xmlOutputBufferWriteString(buf, ":");
887 }
888 xmlOutputBufferWriteString(buf, (const char *)cur->name);
889 xmlOutputBufferWriteString(buf, ">");
890 }
891 if ((format) && (cur->next != NULL) &&
892 (info != NULL) && (!info->isinline)) {
893 if ((cur->next->type != HTML_TEXT_NODE) &&
894 (cur->next->type != HTML_ENTITY_REF_NODE) &&
895 (cur->parent != NULL) &&
896 (cur->parent->name != NULL) &&
897 (cur->parent->name[0] != 'p')) /* p, pre, param */
898 xmlOutputBufferWriteString(buf, "\n");
899 }
900 return;
901 }
902 xmlOutputBufferWriteString(buf, ">");
903 if ((cur->type != XML_ELEMENT_NODE) &&
904 (cur->content != NULL)) {
905 /*
906 * Uses the OutputBuffer property to automatically convert
907 * invalids to charrefs
908 */
909
910 xmlOutputBufferWriteString(buf, (const char *) cur->content);
911 }
912 if (cur->children != NULL) {
913 if ((format) && (info != NULL) && (!info->isinline) &&
914 (cur->children->type != HTML_TEXT_NODE) &&
915 (cur->children->type != HTML_ENTITY_REF_NODE) &&
916 (cur->children != cur->last) &&
917 (cur->name != NULL) &&
918 (cur->name[0] != 'p')) /* p, pre, param */
919 xmlOutputBufferWriteString(buf, "\n");
920 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
921 if ((format) && (info != NULL) && (!info->isinline) &&
922 (cur->last->type != HTML_TEXT_NODE) &&
923 (cur->last->type != HTML_ENTITY_REF_NODE) &&
924 (cur->children != cur->last) &&
925 (cur->name != NULL) &&
926 (cur->name[0] != 'p')) /* p, pre, param */
927 xmlOutputBufferWriteString(buf, "\n");
928 }
929 xmlOutputBufferWriteString(buf, "</");
930 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932 xmlOutputBufferWriteString(buf, ":");
933 }
934 xmlOutputBufferWriteString(buf, (const char *)cur->name);
935 xmlOutputBufferWriteString(buf, ">");
936 if ((format) && (info != NULL) && (!info->isinline) &&
937 (cur->next != NULL)) {
938 if ((cur->next->type != HTML_TEXT_NODE) &&
939 (cur->next->type != HTML_ENTITY_REF_NODE) &&
940 (cur->parent != NULL) &&
941 (cur->parent->name != NULL) &&
942 (cur->parent->name[0] != 'p')) /* p, pre, param */
943 xmlOutputBufferWriteString(buf, "\n");
944 }
945 }
946
947 /**
948 * htmlNodeDumpOutput:
949 * @buf: the HTML buffer output
950 * @doc: the document
951 * @cur: the current node
952 * @encoding: the encoding string
953 *
954 * Dump an HTML node, recursive behaviour,children are printed too,
955 * and formatting returns/spaces are added.
956 */
957 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)958 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959 xmlNodePtr cur, const char *encoding) {
960 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
961 }
962
963 /**
964 * htmlDocContentDumpFormatOutput:
965 * @buf: the HTML buffer output
966 * @cur: the document
967 * @encoding: the encoding string
968 * @format: should formatting spaces been added
969 *
970 * Dump an HTML document.
971 */
972 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)973 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974 const char *encoding, int format) {
975 int type;
976
977 xmlInitParser();
978
979 if ((buf == NULL) || (cur == NULL))
980 return;
981
982 /*
983 * force to output the stuff as HTML, especially for entities
984 */
985 type = cur->type;
986 cur->type = XML_HTML_DOCUMENT_NODE;
987 if (cur->intSubset != NULL) {
988 htmlDtdDumpOutput(buf, cur, NULL);
989 }
990 if (cur->children != NULL) {
991 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
992 }
993 xmlOutputBufferWriteString(buf, "\n");
994 cur->type = (xmlElementType) type;
995 }
996
997 /**
998 * htmlDocContentDumpOutput:
999 * @buf: the HTML buffer output
1000 * @cur: the document
1001 * @encoding: the encoding string
1002 *
1003 * Dump an HTML document. Formating return/spaces are added.
1004 */
1005 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1006 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1007 const char *encoding) {
1008 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1009 }
1010
1011 /************************************************************************
1012 * *
1013 * Saving functions front-ends *
1014 * *
1015 ************************************************************************/
1016
1017 /**
1018 * htmlDocDump:
1019 * @f: the FILE*
1020 * @cur: the document
1021 *
1022 * Dump an HTML document to an open FILE.
1023 *
1024 * returns: the number of byte written or -1 in case of failure.
1025 */
1026 int
htmlDocDump(FILE * f,xmlDocPtr cur)1027 htmlDocDump(FILE *f, xmlDocPtr cur) {
1028 xmlOutputBufferPtr buf;
1029 xmlCharEncodingHandlerPtr handler = NULL;
1030 const char *encoding;
1031 int ret;
1032
1033 xmlInitParser();
1034
1035 if ((cur == NULL) || (f == NULL)) {
1036 return(-1);
1037 }
1038
1039 encoding = (const char *) htmlGetMetaEncoding(cur);
1040
1041 if (encoding != NULL) {
1042 xmlCharEncoding enc;
1043
1044 enc = xmlParseCharEncoding(encoding);
1045 if (enc != cur->charset) {
1046 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1047 /*
1048 * Not supported yet
1049 */
1050 return(-1);
1051 }
1052
1053 handler = xmlFindCharEncodingHandler(encoding);
1054 if (handler == NULL)
1055 return(-1);
1056 } else {
1057 handler = xmlFindCharEncodingHandler(encoding);
1058 }
1059 }
1060
1061 /*
1062 * Fallback to HTML or ASCII when the encoding is unspecified
1063 */
1064 if (handler == NULL)
1065 handler = xmlFindCharEncodingHandler("HTML");
1066 if (handler == NULL)
1067 handler = xmlFindCharEncodingHandler("ascii");
1068
1069 buf = xmlOutputBufferCreateFile(f, handler);
1070 if (buf == NULL) return(-1);
1071 htmlDocContentDumpOutput(buf, cur, NULL);
1072
1073 ret = xmlOutputBufferClose(buf);
1074 return(ret);
1075 }
1076
1077 /**
1078 * htmlSaveFile:
1079 * @filename: the filename (or URL)
1080 * @cur: the document
1081 *
1082 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1083 * used.
1084 * returns: the number of byte written or -1 in case of failure.
1085 */
1086 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1087 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1088 xmlOutputBufferPtr buf;
1089 xmlCharEncodingHandlerPtr handler = NULL;
1090 const char *encoding;
1091 int ret;
1092
1093 if ((cur == NULL) || (filename == NULL))
1094 return(-1);
1095
1096 xmlInitParser();
1097
1098 encoding = (const char *) htmlGetMetaEncoding(cur);
1099
1100 if (encoding != NULL) {
1101 xmlCharEncoding enc;
1102
1103 enc = xmlParseCharEncoding(encoding);
1104 if (enc != cur->charset) {
1105 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1106 /*
1107 * Not supported yet
1108 */
1109 return(-1);
1110 }
1111
1112 handler = xmlFindCharEncodingHandler(encoding);
1113 if (handler == NULL)
1114 return(-1);
1115 }
1116 }
1117
1118 /*
1119 * Fallback to HTML or ASCII when the encoding is unspecified
1120 */
1121 if (handler == NULL)
1122 handler = xmlFindCharEncodingHandler("HTML");
1123 if (handler == NULL)
1124 handler = xmlFindCharEncodingHandler("ascii");
1125
1126 /*
1127 * save the content to a temp buffer.
1128 */
1129 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1130 if (buf == NULL) return(0);
1131
1132 htmlDocContentDumpOutput(buf, cur, NULL);
1133
1134 ret = xmlOutputBufferClose(buf);
1135 return(ret);
1136 }
1137
1138 /**
1139 * htmlSaveFileFormat:
1140 * @filename: the filename
1141 * @cur: the document
1142 * @format: should formatting spaces been added
1143 * @encoding: the document encoding
1144 *
1145 * Dump an HTML document to a file using a given encoding.
1146 *
1147 * returns: the number of byte written or -1 in case of failure.
1148 */
1149 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1150 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1151 const char *encoding, int format) {
1152 xmlOutputBufferPtr buf;
1153 xmlCharEncodingHandlerPtr handler = NULL;
1154 int ret;
1155
1156 if ((cur == NULL) || (filename == NULL))
1157 return(-1);
1158
1159 xmlInitParser();
1160
1161 if (encoding != NULL) {
1162 xmlCharEncoding enc;
1163
1164 enc = xmlParseCharEncoding(encoding);
1165 if (enc != cur->charset) {
1166 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1167 /*
1168 * Not supported yet
1169 */
1170 return(-1);
1171 }
1172
1173 handler = xmlFindCharEncodingHandler(encoding);
1174 if (handler == NULL)
1175 return(-1);
1176 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1177 }
1178 } else {
1179 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1180 }
1181
1182 /*
1183 * Fallback to HTML or ASCII when the encoding is unspecified
1184 */
1185 if (handler == NULL)
1186 handler = xmlFindCharEncodingHandler("HTML");
1187 if (handler == NULL)
1188 handler = xmlFindCharEncodingHandler("ascii");
1189
1190 /*
1191 * save the content to a temp buffer.
1192 */
1193 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1194 if (buf == NULL) return(0);
1195
1196 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1197
1198 ret = xmlOutputBufferClose(buf);
1199 return(ret);
1200 }
1201
1202 /**
1203 * htmlSaveFileEnc:
1204 * @filename: the filename
1205 * @cur: the document
1206 * @encoding: the document encoding
1207 *
1208 * Dump an HTML document to a file using a given encoding
1209 * and formatting returns/spaces are added.
1210 *
1211 * returns: the number of byte written or -1 in case of failure.
1212 */
1213 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1214 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1215 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1216 }
1217
1218 #endif /* LIBXML_OUTPUT_ENABLED */
1219
1220 #define bottom_HTMLtree
1221 #include "elfgcchack.h"
1222 #endif /* LIBXML_HTML_ENABLED */
1223