/* libxml2 - Library for parsing XML documents * Copyright (C) 2006-2019 Free Software Foundation, Inc. * * This file is not part of the GNU gettext program, but is used with * GNU gettext. * * The original copyright notice is as follows: */ /* * Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is fur- * nished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT- * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * * daniel@veillard.com */ /* * HTMLtree.c : implementation of access function for an HTML tree. */ #define IN_LIBXML #include "libxml.h" #ifdef LIBXML_HTML_ENABLED #include /* for memset() only ! */ #ifdef HAVE_CTYPE_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #include #include #include #include #include #include #include #include #include #include "buf.h" /************************************************************************ * * * Getting/Setting encoding meta tags * * * ************************************************************************/ /** * htmlGetMetaEncoding: * @doc: the document * * Encoding definition lookup in the Meta tags * * Returns the current encoding as flagged in the HTML source */ const xmlChar * htmlGetMetaEncoding(htmlDocPtr doc) { htmlNodePtr cur; const xmlChar *content; const xmlChar *encoding; if (doc == NULL) return(NULL); cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"html")) break; if (xmlStrEqual(cur->name, BAD_CAST"head")) goto found_head; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"head")) break; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL); found_head: cur = cur->children; /* * Search the meta elements */ found_meta: while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"meta")) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; if ((http != 0) && (content != NULL)) goto found_content; } attr = attr->next; } } } cur = cur->next; } return(NULL); found_content: encoding = xmlStrstr(content, BAD_CAST"charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET="); if (encoding != NULL) { encoding += 8; } else { encoding = xmlStrstr(content, BAD_CAST"charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); if (encoding != NULL) encoding += 9; } if (encoding != NULL) { while ((*encoding == ' ') || (*encoding == '\t')) encoding++; } return(encoding); } /** * htmlSetMetaEncoding: * @doc: the document * @encoding: the encoding string * * Sets the current encoding in the Meta tags * NOTE: this will not change the document content encoding, just * the META flag associated. * * Returns 0 in case of success and -1 in case of error */ int htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { htmlNodePtr cur, meta = NULL, head = NULL; const xmlChar *content = NULL; char newcontent[100]; newcontent[0] = 0; if (doc == NULL) return(-1); /* html isn't a real encoding it's just libxml2 way to get entities */ if (!xmlStrcasecmp(encoding, BAD_CAST "html")) return(-1); if (encoding != NULL) { snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", (char *)encoding); newcontent[sizeof(newcontent) - 1] = 0; } cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) goto found_head; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) goto found_meta; } cur = cur->next; } if (cur == NULL) return(-1); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { head = cur->parent; goto found_meta; } } cur = cur->next; } if (cur == NULL) return(-1); found_head: head = cur; if (cur->children == NULL) goto create; cur = cur->children; found_meta: /* * Search and update all the remaining the meta elements carrying * encoding informations */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else { if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; } if ((http != 0) && (content != NULL)) break; } attr = attr->next; } if ((http != 0) && (content != NULL)) { meta = cur; break; } } } cur = cur->next; } create: if (meta == NULL) { if ((encoding != NULL) && (head != NULL)) { /* * Create a new Meta element with the right attributes */ meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); if (head->children == NULL) xmlAddChild(head, meta); else xmlAddPrevSibling(head->children, meta); xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); } } else { /* remove the meta tag if NULL is passed */ if (encoding == NULL) { xmlUnlinkNode(meta); xmlFreeNode(meta); } /* change the document only if there is a real encoding change */ else if (xmlStrcasestr(content, encoding) == NULL) { xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); } } return(0); } /** * booleanHTMLAttrs: * * These are the HTML attributes which will be output * in minimized form, i.e.