/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// How we handle the base tag better.
// Current status:
// At present, the normal way we handle the base tag is:
// a) Links that have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs that point to the
// locally saved file. These links are not resolved as absolute file URLs
// because, if they were, moving the saved files from one directory to
// another would turn those file URLs into dead links.
// b) Links that do not have corresponding locally saved files, such as
// links in A and AREA tags, are resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// FireFox also handles the base tag this way.
//
// Problem:
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href=\"http://www.yahoo.com/\"...');" to insert
// base tag(s) into the DOM, so all URLs which point to locally saved resource
// files will be resolved as "http://www.yahoo.com/yahoo_files/...", which
// means none of the saved resource files can be loaded correctly. The page
// will also render badly, since all saved sub-resource files (such as CSS
// and JavaScript files) and sub-frame files cannot be fetched.
// Currently FireFox, IE and WebKit based browsers all have this problem.
//
// Solution:
// My solution is that we comment out the old base tag and write a new base
// tag, <base href="." ...>, after the previously commented base tag. WebKit
// always uses the latest "href" attribute of a base tag to set the document's
// base URL. Based on this behavior, when we encounter a base tag, we comment
// it out and write a new base tag after the commented one.
// The newly added base tag helps the engine locate the correct base URL for
// loading locally saved resource files. I also think we need to inherit
// the base target value from the document object when appending the new base
// tag.
// If there are multiple base tags in the original document, we comment out
// all old base tags and append a new base tag after each of them, because we
// do not know whether those old base tags are original content or were added
// by JavaScript. If they were added by JavaScript, then when the saved page
// is loaded the script(s) will still insert base tag(s) into the DOM, so the
// newly added base tag(s) can override the incorrect base URL and make sure
// we always load the correct locally saved resource files.

#include "config.h"
#include "web/WebPageSerializerImpl.h"

#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/DocumentType.h"
#include "core/dom/Element.h"
#include "core/editing/markup.h"
#include "core/html/HTMLAllCollection.h"
#include "core/html/HTMLElement.h"
#include "core/html/HTMLFormElement.h"
#include "core/html/HTMLHtmlElement.h"
#include "core/html/HTMLMetaElement.h"
#include "core/loader/DocumentLoader.h"
#include "core/loader/FrameLoader.h"
#include "public/platform/WebVector.h"
#include "web/WebLocalFrameImpl.h"
#include "wtf/text/TextEncoding.h"

namespace blink {

// Maximum length of the data buffer which is used to temporarily save
// generated html content data. This is a soft limit which might be exceeded
// if a very large contiguous string is found in the page.
static const unsigned dataBufferCapacity = 65536; WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, const WTF::TextEncoding& textEncoding, Document* document, const String& directoryName) : url(url) , textEncoding(textEncoding) , document(document) , directoryName(directoryName) , isHTMLDocument(document->isHTMLDocument()) , haveSeenDocType(false) , haveAddedCharsetDeclaration(false) , skipMetaElement(0) , isInScriptOrStyleTag(false) , haveAddedXMLProcessingDirective(false) , haveAddedContentsBeforeEnd(false) { } String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( const Element* element, SerializeDomParam* param, bool* needSkip) { StringBuilder result; *needSkip = false; if (param->isHTMLDocument) { // Skip the open tag of original META tag which declare charset since we // have overrided the META which have correct charset declaration after // serializing open tag of HEAD element. ASSERT(element); if (isHTMLMetaElement(*element)) { const HTMLMetaElement& meta = toHTMLMetaElement(*element); // Check whether the META tag has declared charset or not. String equiv = meta.httpEquiv(); if (equalIgnoringCase(equiv, "content-type")) { String content = meta.content(); if (content.length() && content.contains("charset", false)) { // Find META tag declared charset, we need to skip it when // serializing DOM. param->skipMetaElement = element; *needSkip = true; } } } else if (isHTMLHtmlElement(*element)) { // Check something before processing the open tag of HEAD element. // First we add doc type declaration if original document has it. if (!param->haveSeenDocType) { param->haveSeenDocType = true; result.append(createMarkup(param->document->doctype())); } // Add MOTW declaration before html tag. // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); } else if (isHTMLBaseElement(*element)) { // Comment the BASE tag when serializing dom. 
result.appendLiteral(""); // Append a new base tag declaration. result.append(WebPageSerializer::generateBaseTagDeclaration( param->document->baseTarget())); } return result.toString(); } void WebPageSerializerImpl::saveHTMLContentToBuffer( const String& result, SerializeDomParam* param) { m_dataBuffer.append(result); encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, param, DoNotForceFlush); } void WebPageSerializerImpl::encodeAndFlushBuffer( WebPageSerializerClient::PageSerializationStatus status, SerializeDomParam* param, FlushOption flushOption) { // Data buffer is not full nor do we want to force flush. if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity) return; String content = m_dataBuffer.toString(); m_dataBuffer.clear(); CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables); // Send result to the client. m_client->didSerializeDataForFrame(param->url, WebCString(encodedContent.data(), encodedContent.length()), status); } void WebPageSerializerImpl::openTagToString(Element* element, SerializeDomParam* param) { bool needSkip; StringBuilder result; // Do pre action for open tag. result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); if (needSkip) return; // Add open tag result.append('<'); result.append(element->nodeName().lower()); // Go through all attributes and serialize them. AttributeCollection attributes = element->attributes(); AttributeCollection::iterator end = attributes.end(); for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) { result.append(' '); // Add attribute pair result.append(it->name().toString()); result.appendLiteral("=\""); if (!it->value().isEmpty()) { const String& attrValue = it->value(); // Check whether we need to replace some resource links // with local resource paths. 
const QualifiedName& attrName = it->name(); if (element->hasLegalLinkAttribute(attrName)) { // For links start with "javascript:", we do not change it. if (attrValue.startsWith("javascript:", false)) { result.append(attrValue); } else { // Get the absolute link WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element); String completeURL = subFrame ? subFrame->frame()->document()->url() : param->document->completeURL(attrValue); // Check whether we have local files for those link. if (m_localLinks.contains(completeURL)) { if (!param->directoryName.isEmpty()) { result.appendLiteral("./"); result.append(param->directoryName); result.append('/'); } result.append(m_localLinks.get(completeURL)); } else { result.append(completeURL); } } } else { if (param->isHTMLDocument) result.append(m_htmlEntities.convertEntitiesInString(attrValue)); else result.append(m_xmlEntities.convertEntitiesInString(attrValue)); } } result.append('\"'); } // Do post action for open tag. String addedContents = postActionAfterSerializeOpenTag(element, param); // Complete the open tag for element when it has child/children. if (element->hasChildren() || param->haveAddedContentsBeforeEnd) result.append('>'); // Append the added contents generate in post action of open tag. result.append(addedContents); // Save the result to data buffer. saveHTMLContentToBuffer(result.toString(), param); } // Serialize end tag of an specified element. void WebPageSerializerImpl::endTagToString(Element* element, SerializeDomParam* param) { bool needSkip; StringBuilder result; // Do pre action for end tag. result.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); if (needSkip) return; // Write end tag when element has child/children. if (element->hasChildren() || param->haveAddedContentsBeforeEnd) { result.appendLiteral("nodeName().lower()); result.append('>'); } else { // Check whether we have to write end tag for empty element. 
if (param->isHTMLDocument) { result.append('>'); // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) { // We need to write end tag when it is required. result.appendLiteral("nodeName().lower()); result.append('>'); } } else { // For xml base document. result.appendLiteral(" />"); } } // Do post action for end tag. result.append(postActionAfterSerializeEndTag(element, param)); // Save the result to data buffer. saveHTMLContentToBuffer(result.toString(), param); } void WebPageSerializerImpl::buildContentForNode(Node* node, SerializeDomParam* param) { switch (node->nodeType()) { case Node::ELEMENT_NODE: // Process open tag of element. openTagToString(toElement(node), param); // Walk through the children nodes and process it. for (Node *child = node->firstChild(); child; child = child->nextSibling()) buildContentForNode(child, param); // Process end tag of element. endTagToString(toElement(node), param); break; case Node::TEXT_NODE: saveHTMLContentToBuffer(createMarkup(node), param); break; case Node::ATTRIBUTE_NODE: case Node::DOCUMENT_NODE: case Node::DOCUMENT_FRAGMENT_NODE: // Should not exist. ASSERT_NOT_REACHED(); break; // Document type node can be in DOM? case Node::DOCUMENT_TYPE_NODE: param->haveSeenDocType = true; default: // For other type node, call default action. saveHTMLContentToBuffer(createMarkup(node), param); break; } } WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, bool recursiveSerialization, WebPageSerializerClient* client, const WebVector& links, const WebVector& localPaths, const WebString& localDirectoryName) : m_client(client) , m_recursiveSerialization(recursiveSerialization) , m_framesCollected(false) , m_localDirectoryName(localDirectoryName) , m_htmlEntities(false) , m_xmlEntities(true) { // Must specify available webframe. 
ASSERT(frame); m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); // Make sure we have non 0 client. ASSERT(client); // Build local resources map. ASSERT(links.size() == localPaths.size()); for (size_t i = 0; i < links.size(); i++) { KURL url = links[i]; ASSERT(!m_localLinks.contains(url.string())); m_localLinks.set(url.string(), localPaths[i]); } ASSERT(m_dataBuffer.isEmpty()); } void WebPageSerializerImpl::collectTargetFrames() { ASSERT(!m_framesCollected); m_framesCollected = true; // First, process main frame. m_frames.append(m_specifiedWebLocalFrameImpl); // Return now if user only needs to serialize specified frame, not including // all sub-frames. if (!m_recursiveSerialization) return; // Collect all frames inside the specified frame. for (int i = 0; i < static_cast(m_frames.size()); ++i) { WebLocalFrameImpl* currentFrame = m_frames[i]; // Get current using document. Document* currentDoc = currentFrame->frame()->document(); // Go through sub-frames. RefPtrWillBeRawPtr all = currentDoc->all(); for (unsigned i = 0; Element* element = all->item(i); ++i) { if (!element->isHTMLElement()) continue; WebLocalFrameImpl* webFrame = WebLocalFrameImpl::fromFrameOwnerElement(element); if (webFrame) m_frames.append(webFrame); } } } bool WebPageSerializerImpl::serialize() { if (!m_framesCollected) collectTargetFrames(); bool didSerialization = false; KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url(); for (unsigned i = 0; i < m_frames.size(); ++i) { WebLocalFrameImpl* webFrame = m_frames[i]; Document* document = webFrame->frame()->document(); const KURL& url = document->url(); if (!url.isValid() || !m_localLinks.contains(url.string())) continue; didSerialization = true; const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding(); String directoryName = url == mainURL ? 
m_localDirectoryName : ""; SerializeDomParam param(url, textEncoding, document, directoryName); Element* documentElement = document->documentElement(); if (documentElement) buildContentForNode(documentElement, ¶m); encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush); } ASSERT(m_dataBuffer.isEmpty()); m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); return didSerialization; } } // namespace blink