• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 // How we handle the base tag better.
32 // Current status:
33 // Currently, the normal way we handle the base tag is:
34 // a) For those links which have corresponding local saved files, such as
35 // savable CSS, JavaScript files, they will be written to relative URLs which
36 // point to local saved file. Why those links can not be resolved as absolute
37 // file URLs, because if they are resolved as absolute URLs, after moving the
38 // file location from one directory to another directory, the file URLs will
39 // be dead links.
40 // b) For those links which have not corresponding local saved files, such as
41 // links in A, AREA tags, they will be resolved as absolute URLs.
42 // c) We comment out all base tags when serializing DOM for the page.
43 // FireFox also uses above way to handle base tag.
44 //
45 // Problem:
46 // This way can not handle the following situation:
47 // the base tag is written by JavaScript.
48 // For example. The page "www.yahoo.com" use
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50 // of page when loading page. So when saving page as completed-HTML, we assume
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52 // completed-HTML page, then the JavaScript will insert a base tag
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54 // local saved resource files will be resolved as
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
56 // files can not be loaded correctly. Also the page will be rendered ugly since
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58 // files can not be fetched.
59 // Now FireFox, IE and WebKit based Browser all have this problem.
60 //
61 // Solution:
62 // My solution is that we comment old base tag and write new base tag:
63 // <base href="." ...> after the previous commented base tag. In WebKit, it
64 // always uses the latest "href" attribute of base tag to set document's base
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
66 // write a new base tag <base href="."> after the previous commented base tag.
67 // The new added base tag can help engine to locate correct base URL for
68 // correctly loading local saved resource files. Also I think we need to inherit
69 // the base target value from document object when appending new base tag.
70 // If there are multiple base tags in original document, we will comment all old
71 // base tags and append new base tag after each old base tag because we do not
72 // know those old base tags are original content or added by JavaScript. If
73 // they are added by JavaScript, it means when loading saved page, the script(s)
74 // will still insert base tag(s) to DOM, so the new added base tag(s) can
75 // override the incorrect base URL and make sure we always load correct local
76 // saved resource files.
77 
78 #include "config.h"
79 #include "web/WebPageSerializerImpl.h"
80 
81 #include "core/HTMLNames.h"
82 #include "core/dom/Document.h"
83 #include "core/dom/DocumentType.h"
84 #include "core/dom/Element.h"
85 #include "core/editing/markup.h"
86 #include "core/html/HTMLAllCollection.h"
87 #include "core/html/HTMLElement.h"
88 #include "core/html/HTMLFormElement.h"
89 #include "core/html/HTMLHtmlElement.h"
90 #include "core/html/HTMLMetaElement.h"
91 #include "core/loader/DocumentLoader.h"
92 #include "core/loader/FrameLoader.h"
93 #include "public/platform/WebVector.h"
94 #include "web/WebLocalFrameImpl.h"
95 #include "wtf/text/TextEncoding.h"
96 
97 using namespace WebCore;
98 
99 namespace blink {
100 
101 // Maximum length of data buffer which is used to temporarily save generated
102 // html content data. This is a soft limit which might be exceeded if a very
103 // large contiguous string is found in the page.
104 static const unsigned dataBufferCapacity = 65536;
105 
SerializeDomParam(const KURL & url,const WTF::TextEncoding & textEncoding,Document * document,const String & directoryName)106 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
107                                                             const WTF::TextEncoding& textEncoding,
108                                                             Document* document,
109                                                             const String& directoryName)
110     : url(url)
111     , textEncoding(textEncoding)
112     , document(document)
113     , directoryName(directoryName)
114     , isHTMLDocument(document->isHTMLDocument())
115     , haveSeenDocType(false)
116     , haveAddedCharsetDeclaration(false)
117     , skipMetaElement(0)
118     , isInScriptOrStyleTag(false)
119     , haveAddedXMLProcessingDirective(false)
120     , haveAddedContentsBeforeEnd(false)
121 {
122 }
123 
preActionBeforeSerializeOpenTag(const Element * element,SerializeDomParam * param,bool * needSkip)124 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
125     const Element* element, SerializeDomParam* param, bool* needSkip)
126 {
127     StringBuilder result;
128 
129     *needSkip = false;
130     if (param->isHTMLDocument) {
131         // Skip the open tag of original META tag which declare charset since we
132         // have overrided the META which have correct charset declaration after
133         // serializing open tag of HEAD element.
134         ASSERT(element);
135         if (isHTMLMetaElement(*element)) {
136             const HTMLMetaElement& meta = toHTMLMetaElement(*element);
137             // Check whether the META tag has declared charset or not.
138             String equiv = meta.httpEquiv();
139             if (equalIgnoringCase(equiv, "content-type")) {
140                 String content = meta.content();
141                 if (content.length() && content.contains("charset", false)) {
142                     // Find META tag declared charset, we need to skip it when
143                     // serializing DOM.
144                     param->skipMetaElement = element;
145                     *needSkip = true;
146                 }
147             }
148         } else if (isHTMLHtmlElement(*element)) {
149             // Check something before processing the open tag of HEAD element.
150             // First we add doc type declaration if original document has it.
151             if (!param->haveSeenDocType) {
152                 param->haveSeenDocType = true;
153                 result.append(createMarkup(param->document->doctype()));
154             }
155 
156             // Add MOTW declaration before html tag.
157             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
158             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
159         } else if (isHTMLBaseElement(*element)) {
160             // Comment the BASE tag when serializing dom.
161             result.append("<!--");
162         }
163     } else {
164         // Write XML declaration.
165         if (!param->haveAddedXMLProcessingDirective) {
166             param->haveAddedXMLProcessingDirective = true;
167             // Get encoding info.
168             String xmlEncoding = param->document->xmlEncoding();
169             if (xmlEncoding.isEmpty())
170                 xmlEncoding = param->document->encodingName();
171             if (xmlEncoding.isEmpty())
172                 xmlEncoding = UTF8Encoding().name();
173             result.append("<?xml version=\"");
174             result.append(param->document->xmlVersion());
175             result.append("\" encoding=\"");
176             result.append(xmlEncoding);
177             if (param->document->xmlStandalone())
178                 result.append("\" standalone=\"yes");
179             result.append("\"?>\n");
180         }
181         // Add doc type declaration if original document has it.
182         if (!param->haveSeenDocType) {
183             param->haveSeenDocType = true;
184             result.append(createMarkup(param->document->doctype()));
185         }
186     }
187     return result.toString();
188 }
189 
postActionAfterSerializeOpenTag(const Element * element,SerializeDomParam * param)190 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
191     const Element* element, SerializeDomParam* param)
192 {
193     StringBuilder result;
194 
195     param->haveAddedContentsBeforeEnd = false;
196     if (!param->isHTMLDocument)
197         return result.toString();
198     // Check after processing the open tag of HEAD element
199     if (!param->haveAddedCharsetDeclaration
200         && isHTMLHeadElement(*element)) {
201         param->haveAddedCharsetDeclaration = true;
202         // Check meta element. WebKit only pre-parse the first 512 bytes
203         // of the document. If the whole <HEAD> is larger and meta is the
204         // end of head part, then this kind of pages aren't decoded correctly
205         // because of this issue. So when we serialize the DOM, we need to
206         // make sure the meta will in first child of head tag.
207         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
208         // First we generate new content for writing correct META element.
209         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
210             String(param->textEncoding.name())));
211 
212         param->haveAddedContentsBeforeEnd = true;
213         // Will search each META which has charset declaration, and skip them all
214         // in PreActionBeforeSerializeOpenTag.
215     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
216         param->isInScriptOrStyleTag = true;
217     }
218 
219     return result.toString();
220 }
221 
preActionBeforeSerializeEndTag(const Element * element,SerializeDomParam * param,bool * needSkip)222 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
223     const Element* element, SerializeDomParam* param, bool* needSkip)
224 {
225     String result;
226 
227     *needSkip = false;
228     if (!param->isHTMLDocument)
229         return result;
230     // Skip the end tag of original META tag which declare charset.
231     // Need not to check whether it's META tag since we guarantee
232     // skipMetaElement is definitely META tag if it's not 0.
233     if (param->skipMetaElement == element) {
234         *needSkip = true;
235     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
236         ASSERT(param->isInScriptOrStyleTag);
237         param->isInScriptOrStyleTag = false;
238     }
239 
240     return result;
241 }
242 
243 // After we finish serializing end tag of a element, we give the target
244 // element a chance to do some post work to add some additional data.
postActionAfterSerializeEndTag(const Element * element,SerializeDomParam * param)245 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
246     const Element* element, SerializeDomParam* param)
247 {
248     StringBuilder result;
249 
250     if (!param->isHTMLDocument)
251         return result.toString();
252     // Comment the BASE tag when serializing DOM.
253     if (isHTMLBaseElement(*element)) {
254         result.append("-->");
255         // Append a new base tag declaration.
256         result.append(WebPageSerializer::generateBaseTagDeclaration(
257             param->document->baseTarget()));
258     }
259 
260     return result.toString();
261 }
262 
saveHTMLContentToBuffer(const String & result,SerializeDomParam * param)263 void WebPageSerializerImpl::saveHTMLContentToBuffer(
264     const String& result, SerializeDomParam* param)
265 {
266     m_dataBuffer.append(result);
267     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
268                          param,
269                          DoNotForceFlush);
270 }
271 
encodeAndFlushBuffer(WebPageSerializerClient::PageSerializationStatus status,SerializeDomParam * param,FlushOption flushOption)272 void WebPageSerializerImpl::encodeAndFlushBuffer(
273     WebPageSerializerClient::PageSerializationStatus status,
274     SerializeDomParam* param,
275     FlushOption flushOption)
276 {
277     // Data buffer is not full nor do we want to force flush.
278     if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
279         return;
280 
281     String content = m_dataBuffer.toString();
282     m_dataBuffer.clear();
283 
284     CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
285 
286     // Send result to the client.
287     m_client->didSerializeDataForFrame(param->url,
288                                        WebCString(encodedContent.data(), encodedContent.length()),
289                                        status);
290 }
291 
openTagToString(Element * element,SerializeDomParam * param)292 void WebPageSerializerImpl::openTagToString(Element* element,
293                                             SerializeDomParam* param)
294 {
295     bool needSkip;
296     StringBuilder result;
297     // Do pre action for open tag.
298     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
299     if (needSkip)
300         return;
301     // Add open tag
302     result.append('<');
303     result.append(element->nodeName().lower());
304     // Go through all attributes and serialize them.
305     if (element->hasAttributes()) {
306         AttributeCollection attributes = element->attributes();
307         AttributeCollection::const_iterator end = attributes.end();
308         for (AttributeCollection::const_iterator it = attributes.begin(); it != end; ++it) {
309             result.append(' ');
310             // Add attribute pair
311             result.append(it->name().toString());
312             result.appendLiteral("=\"");
313             if (!it->value().isEmpty()) {
314                 const String& attrValue = it->value();
315 
316                 // Check whether we need to replace some resource links
317                 // with local resource paths.
318                 const QualifiedName& attrName = it->name();
319                 if (element->hasLegalLinkAttribute(attrName)) {
320                     // For links start with "javascript:", we do not change it.
321                     if (attrValue.startsWith("javascript:", false))
322                         result.append(attrValue);
323                     else {
324                         // Get the absolute link
325                         WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
326                         String completeURL = subFrame ? subFrame->frame()->document()->url() :
327                                                         param->document->completeURL(attrValue);
328                         // Check whether we have local files for those link.
329                         if (m_localLinks.contains(completeURL)) {
330                             if (!param->directoryName.isEmpty()) {
331                                 result.appendLiteral("./");
332                                 result.append(param->directoryName);
333                                 result.append('/');
334                             }
335                             result.append(m_localLinks.get(completeURL));
336                         } else
337                             result.append(completeURL);
338                     }
339                 } else {
340                     if (param->isHTMLDocument)
341                         result.append(m_htmlEntities.convertEntitiesInString(attrValue));
342                     else
343                         result.append(m_xmlEntities.convertEntitiesInString(attrValue));
344                 }
345             }
346             result.append('\"');
347         }
348     }
349 
350     // Do post action for open tag.
351     String addedContents = postActionAfterSerializeOpenTag(element, param);
352     // Complete the open tag for element when it has child/children.
353     if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
354         result.append('>');
355     // Append the added contents generate in  post action of open tag.
356     result.append(addedContents);
357     // Save the result to data buffer.
358     saveHTMLContentToBuffer(result.toString(), param);
359 }
360 
361 // Serialize end tag of an specified element.
endTagToString(Element * element,SerializeDomParam * param)362 void WebPageSerializerImpl::endTagToString(Element* element,
363                                            SerializeDomParam* param)
364 {
365     bool needSkip;
366     StringBuilder result;
367     // Do pre action for end tag.
368     result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
369     if (needSkip)
370         return;
371     // Write end tag when element has child/children.
372     if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
373         result.appendLiteral("</");
374         result.append(element->nodeName().lower());
375         result.append('>');
376     } else {
377         // Check whether we have to write end tag for empty element.
378         if (param->isHTMLDocument) {
379             result.append('>');
380             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
381             if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
382                 // We need to write end tag when it is required.
383                 result.appendLiteral("</");
384                 result.append(element->nodeName().lower());
385                 result.append('>');
386             }
387         } else {
388             // For xml base document.
389             result.appendLiteral(" />");
390         }
391     }
392     // Do post action for end tag.
393     result.append(postActionAfterSerializeEndTag(element, param));
394     // Save the result to data buffer.
395     saveHTMLContentToBuffer(result.toString(), param);
396 }
397 
buildContentForNode(Node * node,SerializeDomParam * param)398 void WebPageSerializerImpl::buildContentForNode(Node* node,
399                                                 SerializeDomParam* param)
400 {
401     switch (node->nodeType()) {
402     case Node::ELEMENT_NODE:
403         // Process open tag of element.
404         openTagToString(toElement(node), param);
405         // Walk through the children nodes and process it.
406         for (Node *child = node->firstChild(); child; child = child->nextSibling())
407             buildContentForNode(child, param);
408         // Process end tag of element.
409         endTagToString(toElement(node), param);
410         break;
411     case Node::TEXT_NODE:
412         saveHTMLContentToBuffer(createMarkup(node), param);
413         break;
414     case Node::ATTRIBUTE_NODE:
415     case Node::DOCUMENT_NODE:
416     case Node::DOCUMENT_FRAGMENT_NODE:
417         // Should not exist.
418         ASSERT_NOT_REACHED();
419         break;
420     // Document type node can be in DOM?
421     case Node::DOCUMENT_TYPE_NODE:
422         param->haveSeenDocType = true;
423     default:
424         // For other type node, call default action.
425         saveHTMLContentToBuffer(createMarkup(node), param);
426         break;
427     }
428 }
429 
WebPageSerializerImpl(WebFrame * frame,bool recursiveSerialization,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)430 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
431                                              bool recursiveSerialization,
432                                              WebPageSerializerClient* client,
433                                              const WebVector<WebURL>& links,
434                                              const WebVector<WebString>& localPaths,
435                                              const WebString& localDirectoryName)
436     : m_client(client)
437     , m_recursiveSerialization(recursiveSerialization)
438     , m_framesCollected(false)
439     , m_localDirectoryName(localDirectoryName)
440     , m_htmlEntities(false)
441     , m_xmlEntities(true)
442 {
443     // Must specify available webframe.
444     ASSERT(frame);
445     m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
446     // Make sure we have non 0 client.
447     ASSERT(client);
448     // Build local resources map.
449     ASSERT(links.size() == localPaths.size());
450     for (size_t i = 0; i < links.size(); i++) {
451         KURL url = links[i];
452         ASSERT(!m_localLinks.contains(url.string()));
453         m_localLinks.set(url.string(), localPaths[i]);
454     }
455 
456     ASSERT(m_dataBuffer.isEmpty());
457 }
458 
collectTargetFrames()459 void WebPageSerializerImpl::collectTargetFrames()
460 {
461     ASSERT(!m_framesCollected);
462     m_framesCollected = true;
463 
464     // First, process main frame.
465     m_frames.append(m_specifiedWebLocalFrameImpl);
466     // Return now if user only needs to serialize specified frame, not including
467     // all sub-frames.
468     if (!m_recursiveSerialization)
469         return;
470     // Collect all frames inside the specified frame.
471     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
472         WebLocalFrameImpl* currentFrame = m_frames[i];
473         // Get current using document.
474         Document* currentDoc = currentFrame->frame()->document();
475         // Go through sub-frames.
476         RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
477 
478         for (unsigned i = 0; Element* element = all->item(i); ++i) {
479             if (!element->isHTMLElement())
480                 continue;
481             WebLocalFrameImpl* webFrame =
482                 WebLocalFrameImpl::fromFrameOwnerElement(element);
483             if (webFrame)
484                 m_frames.append(webFrame);
485         }
486     }
487 }
488 
serialize()489 bool WebPageSerializerImpl::serialize()
490 {
491     if (!m_framesCollected)
492         collectTargetFrames();
493 
494     bool didSerialization = false;
495     KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
496 
497     for (unsigned i = 0; i < m_frames.size(); ++i) {
498         WebLocalFrameImpl* webFrame = m_frames[i];
499         Document* document = webFrame->frame()->document();
500         const KURL& url = document->url();
501 
502         if (!url.isValid() || !m_localLinks.contains(url.string()))
503             continue;
504 
505         didSerialization = true;
506 
507         const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
508         String directoryName = url == mainURL ? m_localDirectoryName : "";
509 
510         SerializeDomParam param(url, textEncoding, document, directoryName);
511 
512         Element* documentElement = document->documentElement();
513         if (documentElement)
514             buildContentForNode(documentElement, &param);
515 
516         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
517     }
518 
519     ASSERT(m_dataBuffer.isEmpty());
520     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
521     return didSerialization;
522 }
523 
524 }  // namespace blink
525