/* * Copyright (C) 2009 Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // How we handle the base tag better. // Current status: // At now the normal way we use to handling base tag is // a) For those links which have corresponding local saved files, such as // savable CSS, JavaScript files, they will be written to relative URLs which // point to local saved file. Why those links can not be resolved as absolute // file URLs, because if they are resolved as absolute URLs, after moving the // file location from one directory to another directory, the file URLs will // be dead links. // b) For those links which have not corresponding local saved files, such as // links in A, AREA tags, they will be resolved as absolute URLs. // c) We comment all base tags when serialzing DOM for the page. // FireFox also uses above way to handle base tag. // // Problem: // This way can not handle the following situation: // the base tag is written by JavaScript. // For example. The page "www.yahoo.com" use // "document.write(' to DOM, so all URLs which point to // local saved resource files will be resolved as // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource // files can not be loaded correctly. Also the page will be rendered ugly since // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame // files can not be fetched. // Now FireFox, IE and WebKit based Browser all have this problem. // // Solution: // My solution is that we comment old base tag and write new base tag: // after the previous commented base tag. In WebKit, it // always uses the latest "href" attribute of base tag to set document's base // URL. Based on this behavior, when we encounter a base tag, we comment it and // write a new base tag after the previous commented base tag. // The new added base tag can help engine to locate correct base URL for // correctly loading local saved resource files. Also I think we need to inherit // the base target value from document object when appending new base tag. // If there are multiple base tags in original document, we will comment all old // base tags and append new base tag after each old base tag because we do not // know those old base tags are original content or added by JavaScript. If // they are added by JavaScript, it means when loading saved page, the script(s) // will still insert base tag(s) to DOM, so the new added base tag(s) can // override the incorrect base URL and make sure we alway load correct local // saved resource files. #include "config.h" #include "WebPageSerializerImpl.h" #include "Document.h" #include "DocumentType.h" #include "Element.h" #include "FrameLoader.h" #include "HTMLAllCollection.h" #include "HTMLElement.h" #include "HTMLFormElement.h" #include "HTMLMetaElement.h" #include "HTMLNames.h" #include "KURL.h" #include "PlatformString.h" #include "StringBuilder.h" #include "TextEncoding.h" #include "markup.h" #include "DOMUtilitiesPrivate.h" #include "WebFrameImpl.h" #include "WebURL.h" #include "WebVector.h" using namespace WebCore; namespace WebKit { // Maximum length of data buffer which is used to temporary save generated // html content data. This is a soft limit which might be passed if a very large // contegious string is found in the page. static const unsigned dataBufferCapacity = 65536; WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL, const TextEncoding& textEncoding, Document* doc, const String& directoryName) : currentFrameURL(currentFrameURL) , textEncoding(textEncoding) , doc(doc) , directoryName(directoryName) , hasDoctype(false) , hasCheckedMeta(false) , skipMetaElement(0) , isInScriptOrStyleTag(false) , hasDocDeclaration(false) { // Cache the value since we check it lots of times. isHTMLDocument = doc->isHTMLDocument(); } String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( const Element* element, SerializeDomParam* param, bool* needSkip) { StringBuilder result; *needSkip = false; if (param->isHTMLDocument) { // Skip the open tag of original META tag which declare charset since we // have overrided the META which have correct charset declaration after // serializing open tag of HEAD element. if (element->hasTagName(HTMLNames::metaTag)) { const HTMLMetaElement* meta = static_cast(element); // Check whether the META tag has declared charset or not. String equiv = meta->httpEquiv(); if (equalIgnoringCase(equiv, "content-type")) { String content = meta->content(); if (content.length() && content.contains("charset", false)) { // Find META tag declared charset, we need to skip it when // serializing DOM. param->skipMetaElement = element; *needSkip = true; } } } else if (element->hasTagName(HTMLNames::htmlTag)) { // Check something before processing the open tag of HEAD element. // First we add doc type declaration if original doc has it. if (!param->hasDoctype) { param->hasDoctype = true; result.append(createMarkup(param->doc->doctype())); } // Add MOTW declaration before html tag. // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL)); } else if (element->hasTagName(HTMLNames::baseTag)) { // Comment the BASE tag when serializing dom. result.append(""); // Append a new base tag declaration. result.append(WebPageSerializer::generateBaseTagDeclaration( param->doc->baseTarget())); } return result.toString(); } void WebPageSerializerImpl::saveHTMLContentToBuffer( const String& result, SerializeDomParam* param) { m_dataBuffer.append(result); encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, param, 0); } void WebPageSerializerImpl::encodeAndFlushBuffer( WebPageSerializerClient::PageSerializationStatus status, SerializeDomParam* param, bool force) { // Data buffer is not full nor do we want to force flush. if (!force && m_dataBuffer.length() <= dataBufferCapacity) return; String content = m_dataBuffer.toString(); m_dataBuffer.clear(); // Convert the unicode content to target encoding CString encodedContent = param->textEncoding.encode( content.characters(), content.length(), EntitiesForUnencodables); // Send result to the client. m_client->didSerializeDataForFrame(param->currentFrameURL, WebCString(encodedContent.data(), encodedContent.length()), status); } void WebPageSerializerImpl::openTagToString(const Element* element, SerializeDomParam* param) { // FIXME: use StringBuilder instead of String. bool needSkip; // Do pre action for open tag. String result = preActionBeforeSerializeOpenTag(element, param, &needSkip); if (needSkip) return; // Add open tag result += "<" + element->nodeName(); // Go through all attributes and serialize them. const NamedNodeMap *attrMap = element->attributes(true); if (attrMap) { unsigned numAttrs = attrMap->length(); for (unsigned i = 0; i < numAttrs; i++) { result += " "; // Add attribute pair const Attribute *attribute = attrMap->attributeItem(i); result += attribute->name().toString(); result += "=\""; if (!attribute->value().isEmpty()) { const String& attrValue = attribute->value(); // Check whether we need to replace some resource links // with local resource paths. const QualifiedName& attrName = attribute->name(); if (elementHasLegalLinkAttribute(element, attrName)) { // For links start with "javascript:", we do not change it. if (attrValue.startsWith("javascript:", false)) result += attrValue; else { // Get the absolute link String completeURL = param->doc->completeURL(attrValue); // Check whether we have local files for those link. if (m_localLinks.contains(completeURL)) { if (!m_localDirectoryName.isEmpty()) result += "./" + m_localDirectoryName + "/"; result += m_localLinks.get(completeURL); } else result += completeURL; } } else { if (param->isHTMLDocument) result += m_htmlEntities.convertEntitiesInString(attrValue); else result += m_xmlEntities.convertEntitiesInString(attrValue); } } result += "\""; } } // Do post action for open tag. String addedContents = postActionAfterSerializeOpenTag(element, param); // Complete the open tag for element when it has child/children. if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) result += ">"; // Append the added contents generate in post action of open tag. result += addedContents; // Save the result to data buffer. saveHTMLContentToBuffer(result, param); } // Serialize end tag of an specified element. void WebPageSerializerImpl::endTagToString(const Element* element, SerializeDomParam* param) { bool needSkip; // Do pre action for end tag. String result = preActionBeforeSerializeEndTag(element, param, &needSkip); if (needSkip) return; // Write end tag when element has child/children. if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) { result += "nodeName(); result += ">"; } else { // Check whether we have to write end tag for empty element. if (param->isHTMLDocument) { result += ">"; const HTMLElement* htmlElement = static_cast(element); if (htmlElement->endTagRequirement() == TagStatusRequired) { // We need to write end tag when it is required. result += "nodeName(); result += ">"; } } else { // For xml base document. result += " />"; } } // Do post action for end tag. result += postActionAfterSerializeEndTag(element, param); // Save the result to data buffer. saveHTMLContentToBuffer(result, param); } void WebPageSerializerImpl::buildContentForNode(const Node* node, SerializeDomParam* param) { switch (node->nodeType()) { case Node::ELEMENT_NODE: // Process open tag of element. openTagToString(static_cast(node), param); // Walk through the children nodes and process it. for (const Node *child = node->firstChild(); child; child = child->nextSibling()) buildContentForNode(child, param); // Process end tag of element. endTagToString(static_cast(node), param); break; case Node::TEXT_NODE: saveHTMLContentToBuffer(createMarkup(node), param); break; case Node::ATTRIBUTE_NODE: case Node::DOCUMENT_NODE: case Node::DOCUMENT_FRAGMENT_NODE: // Should not exist. ASSERT_NOT_REACHED(); break; // Document type node can be in DOM? case Node::DOCUMENT_TYPE_NODE: param->hasDoctype = true; default: // For other type node, call default action. saveHTMLContentToBuffer(createMarkup(node), param); break; } } WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, bool recursiveSerialization, WebPageSerializerClient* client, const WebVector& links, const WebVector& localPaths, const WebString& localDirectoryName) : m_client(client) , m_recursiveSerialization(recursiveSerialization) , m_framesCollected(false) , m_localDirectoryName(localDirectoryName) , m_htmlEntities(false) , m_xmlEntities(true) { // Must specify available webframe. ASSERT(frame); m_specifiedWebFrameImpl = static_cast(frame); // Make sure we have non 0 client. ASSERT(client); // Build local resources map. ASSERT(links.size() == localPaths.size()); for (size_t i = 0; i < links.size(); i++) { KURL url = links[i]; ASSERT(!m_localLinks.contains(url.string())); m_localLinks.set(url.string(), localPaths[i]); } ASSERT(!m_dataBuffer.length()); } void WebPageSerializerImpl::collectTargetFrames() { ASSERT(!m_framesCollected); m_framesCollected = true; // First, process main frame. m_frames.append(m_specifiedWebFrameImpl); // Return now if user only needs to serialize specified frame, not including // all sub-frames. if (!m_recursiveSerialization) return; // Collect all frames inside the specified frame. for (int i = 0; i < static_cast(m_frames.size()); ++i) { WebFrameImpl* currentFrame = m_frames[i]; // Get current using document. Document* currentDoc = currentFrame->frame()->document(); // Go through sub-frames. RefPtr all = currentDoc->all(); for (Node* node = all->firstItem(); node; node = all->nextItem()) { if (!node->isHTMLElement()) continue; Element* element = static_cast(node); WebFrameImpl* webFrame = WebFrameImpl::fromFrameOwnerElement(element); if (webFrame) m_frames.append(webFrame); } } } bool WebPageSerializerImpl::serialize() { // Collect target frames. if (!m_framesCollected) collectTargetFrames(); bool didSerialization = false; // Get KURL for main frame. KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url(); // Go through all frames for serializing DOM for whole page, include // sub-frames. for (int i = 0; i < static_cast(m_frames.size()); ++i) { // Get current serializing frame. WebFrameImpl* currentFrame = m_frames[i]; // Get current using document. Document* currentDoc = currentFrame->frame()->document(); // Get current frame's URL. const KURL& currentFrameURL = currentFrame->frame()->loader()->url(); // Check whether we have done this document. if (m_localLinks.contains(currentFrameURL.string())) { // A new document, we will serialize it. didSerialization = true; // Get target encoding for current document. String encoding = currentFrame->frame()->loader()->encoding(); // Create the text encoding object with target encoding. TextEncoding textEncoding(encoding); // Construct serialize parameter for late processing document. SerializeDomParam param(currentFrameURL, encoding.length() ? textEncoding : UTF8Encoding(), currentDoc, currentFrameURL == mainPageURL ? m_localDirectoryName : ""); // Process current document. Element* rootElement = currentDoc->documentElement(); if (rootElement) buildContentForNode(rootElement, ¶m); // Flush the remainder data and finish serializing current frame. encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, 1); } } // We have done call frames, so we send message to embedder to tell it that // frames are finished serializing. ASSERT(!m_dataBuffer.length()); m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); return didSerialization; } } // namespace WebKit