• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 // How we handle the base tag better.
32 // Current status:
// Currently, the way we handle the base tag is:
34 // a) For those links which have corresponding local saved files, such as
35 // savable CSS, JavaScript files, they will be written to relative URLs which
36 // point to local saved file. Why those links can not be resolved as absolute
37 // file URLs, because if they are resolved as absolute URLs, after moving the
38 // file location from one directory to another directory, the file URLs will
39 // be dead links.
40 // b) For those links which have not corresponding local saved files, such as
41 // links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
43 // FireFox also uses above way to handle base tag.
44 //
45 // Problem:
46 // This way can not handle the following situation:
47 // the base tag is written by JavaScript.
48 // For example. The page "www.yahoo.com" use
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50 // of page when loading page. So when saving page as completed-HTML, we assume
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52 // completed-HTML page, then the JavaScript will insert a base tag
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54 // local saved resource files will be resolved as
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
56 // files can not be loaded correctly. Also the page will be rendered ugly since
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58 // files can not be fetched.
59 // Now FireFox, IE and WebKit based Browser all have this problem.
60 //
61 // Solution:
62 // My solution is that we comment old base tag and write new base tag:
63 // <base href="." ...> after the previous commented base tag. In WebKit, it
64 // always uses the latest "href" attribute of base tag to set document's base
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
66 // write a new base tag <base href="."> after the previous commented base tag.
67 // The new added base tag can help engine to locate correct base URL for
68 // correctly loading local saved resource files. Also I think we need to inherit
69 // the base target value from document object when appending new base tag.
70 // If there are multiple base tags in original document, we will comment all old
71 // base tags and append new base tag after each old base tag because we do not
72 // know those old base tags are original content or added by JavaScript. If
73 // they are added by JavaScript, it means when loading saved page, the script(s)
74 // will still insert base tag(s) to DOM, so the new added base tag(s) can
// override the incorrect base URL and make sure we always load the correct local
76 // saved resource files.
77 
78 #include "config.h"
79 #include "WebPageSerializerImpl.h"
80 
81 #include "Document.h"
82 #include "DocumentType.h"
83 #include "Element.h"
84 #include "FrameLoader.h"
85 #include "HTMLAllCollection.h"
86 #include "HTMLElement.h"
87 #include "HTMLFormElement.h"
88 #include "HTMLMetaElement.h"
89 #include "HTMLNames.h"
90 #include "KURL.h"
91 #include "PlatformString.h"
92 #include "StringBuilder.h"
93 #include "TextEncoding.h"
94 #include "markup.h"
95 
96 #include "DOMUtilitiesPrivate.h"
97 #include "WebFrameImpl.h"
98 #include "WebURL.h"
99 #include "WebVector.h"
100 
101 using namespace WebCore;
102 
103 namespace WebKit {
104 
// Threshold on the length of the data buffer that temporarily holds generated
// HTML content before it is encoded and flushed to the client. This is a soft
// limit: content is appended first and the length is checked afterwards, so a
// single very large contiguous string can push the buffer past it.
static const unsigned dataBufferCapacity = 64 * 1024;
109 
SerializeDomParam(const KURL & currentFrameURL,const TextEncoding & textEncoding,Document * doc,const String & directoryName)110 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL,
111                                                             const TextEncoding& textEncoding,
112                                                             Document* doc,
113                                                             const String& directoryName)
114     : currentFrameURL(currentFrameURL)
115     , textEncoding(textEncoding)
116     , doc(doc)
117     , directoryName(directoryName)
118     , hasDoctype(false)
119     , hasCheckedMeta(false)
120     , skipMetaElement(0)
121     , isInScriptOrStyleTag(false)
122     , hasDocDeclaration(false)
123 {
124     // Cache the value since we check it lots of times.
125     isHTMLDocument = doc->isHTMLDocument();
126 }
127 
preActionBeforeSerializeOpenTag(const Element * element,SerializeDomParam * param,bool * needSkip)128 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
129     const Element* element, SerializeDomParam* param, bool* needSkip)
130 {
131     StringBuilder result;
132 
133     *needSkip = false;
134     if (param->isHTMLDocument) {
135         // Skip the open tag of original META tag which declare charset since we
136         // have overrided the META which have correct charset declaration after
137         // serializing open tag of HEAD element.
138         if (element->hasTagName(HTMLNames::metaTag)) {
139             const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
140             // Check whether the META tag has declared charset or not.
141             String equiv = meta->httpEquiv();
142             if (equalIgnoringCase(equiv, "content-type")) {
143                 String content = meta->content();
144                 if (content.length() && content.contains("charset", false)) {
145                     // Find META tag declared charset, we need to skip it when
146                     // serializing DOM.
147                     param->skipMetaElement = element;
148                     *needSkip = true;
149                 }
150             }
151         } else if (element->hasTagName(HTMLNames::htmlTag)) {
152             // Check something before processing the open tag of HEAD element.
153             // First we add doc type declaration if original doc has it.
154             if (!param->hasDoctype) {
155                 param->hasDoctype = true;
156                 result.append(createMarkup(param->doc->doctype()));
157             }
158 
159             // Add MOTW declaration before html tag.
160             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
161             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL));
162         } else if (element->hasTagName(HTMLNames::baseTag)) {
163             // Comment the BASE tag when serializing dom.
164             result.append("<!--");
165         }
166     } else {
167         // Write XML declaration.
168         if (!param->hasDocDeclaration) {
169             param->hasDocDeclaration = true;
170             // Get encoding info.
171             String xmlEncoding = param->doc->xmlEncoding();
172             if (xmlEncoding.isEmpty())
173                 xmlEncoding = param->doc->frame()->loader()->encoding();
174             if (xmlEncoding.isEmpty())
175                 xmlEncoding = UTF8Encoding().name();
176             result.append("<?xml version=\"");
177             result.append(param->doc->xmlVersion());
178             result.append("\" encoding=\"");
179             result.append(xmlEncoding);
180             if (param->doc->xmlStandalone())
181                 result.append("\" standalone=\"yes");
182             result.append("\"?>\n");
183         }
184         // Add doc type declaration if original doc has it.
185         if (!param->hasDoctype) {
186             param->hasDoctype = true;
187             result.append(createMarkup(param->doc->doctype()));
188         }
189     }
190     return result.toString();
191 }
192 
postActionAfterSerializeOpenTag(const Element * element,SerializeDomParam * param)193 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
194     const Element* element, SerializeDomParam* param)
195 {
196     StringBuilder result;
197 
198     param->hasAddedContentsBeforeEnd = false;
199     if (!param->isHTMLDocument)
200         return result.toString();
201     // Check after processing the open tag of HEAD element
202     if (!param->hasCheckedMeta
203         && element->hasTagName(HTMLNames::headTag)) {
204         param->hasCheckedMeta = true;
205         // Check meta element. WebKit only pre-parse the first 512 bytes
206         // of the document. If the whole <HEAD> is larger and meta is the
207         // end of head part, then this kind of pages aren't decoded correctly
208         // because of this issue. So when we serialize the DOM, we need to
209         // make sure the meta will in first child of head tag.
210         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
211         // First we generate new content for writing correct META element.
212         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
213             String(param->textEncoding.name())));
214 
215         param->hasAddedContentsBeforeEnd = true;
216         // Will search each META which has charset declaration, and skip them all
217         // in PreActionBeforeSerializeOpenTag.
218     } else if (element->hasTagName(HTMLNames::scriptTag)
219                || element->hasTagName(HTMLNames::styleTag)) {
220         param->isInScriptOrStyleTag = true;
221     }
222 
223     return result.toString();
224 }
225 
preActionBeforeSerializeEndTag(const Element * element,SerializeDomParam * param,bool * needSkip)226 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
227     const Element* element, SerializeDomParam* param, bool* needSkip)
228 {
229     String result;
230 
231     *needSkip = false;
232     if (!param->isHTMLDocument)
233         return result;
234     // Skip the end tag of original META tag which declare charset.
235     // Need not to check whether it's META tag since we guarantee
236     // skipMetaElement is definitely META tag if it's not 0.
237     if (param->skipMetaElement == element)
238         *needSkip = true;
239     else if (element->hasTagName(HTMLNames::scriptTag)
240              || element->hasTagName(HTMLNames::styleTag)) {
241         ASSERT(param->isInScriptOrStyleTag);
242         param->isInScriptOrStyleTag = false;
243     }
244 
245     return result;
246 }
247 
248 // After we finish serializing end tag of a element, we give the target
249 // element a chance to do some post work to add some additional data.
postActionAfterSerializeEndTag(const Element * element,SerializeDomParam * param)250 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
251     const Element* element, SerializeDomParam* param)
252 {
253     StringBuilder result;
254 
255     if (!param->isHTMLDocument)
256         return result.toString();
257     // Comment the BASE tag when serializing DOM.
258     if (element->hasTagName(HTMLNames::baseTag)) {
259         result.append("-->");
260         // Append a new base tag declaration.
261         result.append(WebPageSerializer::generateBaseTagDeclaration(
262             param->doc->baseTarget()));
263     }
264 
265     return result.toString();
266 }
267 
saveHTMLContentToBuffer(const String & result,SerializeDomParam * param)268 void WebPageSerializerImpl::saveHTMLContentToBuffer(
269     const String& result, SerializeDomParam* param)
270 {
271     m_dataBuffer.append(result);
272     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
273                          param,
274                          0);
275 }
276 
encodeAndFlushBuffer(WebPageSerializerClient::PageSerializationStatus status,SerializeDomParam * param,bool force)277 void WebPageSerializerImpl::encodeAndFlushBuffer(
278     WebPageSerializerClient::PageSerializationStatus status,
279     SerializeDomParam* param,
280     bool force)
281 {
282     // Data buffer is not full nor do we want to force flush.
283     if (!force && m_dataBuffer.length() <= dataBufferCapacity)
284         return;
285 
286     String content = m_dataBuffer.toString();
287     m_dataBuffer.clear();
288 
289     // Convert the unicode content to target encoding
290     CString encodedContent = param->textEncoding.encode(
291         content.characters(), content.length(), EntitiesForUnencodables);
292 
293     // Send result to the client.
294     m_client->didSerializeDataForFrame(param->currentFrameURL,
295                                        WebCString(encodedContent.data(), encodedContent.length()),
296                                        status);
297 }
298 
openTagToString(const Element * element,SerializeDomParam * param)299 void WebPageSerializerImpl::openTagToString(const Element* element,
300                                             SerializeDomParam* param)
301 {
302     // FIXME: use StringBuilder instead of String.
303     bool needSkip;
304     // Do pre action for open tag.
305     String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
306     if (needSkip)
307         return;
308     // Add open tag
309     result += "<" + element->nodeName();
310     // Go through all attributes and serialize them.
311     const NamedNodeMap *attrMap = element->attributes(true);
312     if (attrMap) {
313         unsigned numAttrs = attrMap->length();
314         for (unsigned i = 0; i < numAttrs; i++) {
315             result += " ";
316             // Add attribute pair
317             const Attribute *attribute = attrMap->attributeItem(i);
318             result += attribute->name().toString();
319             result += "=\"";
320             if (!attribute->value().isEmpty()) {
321                 const String& attrValue = attribute->value();
322 
323                 // Check whether we need to replace some resource links
324                 // with local resource paths.
325                 const QualifiedName& attrName = attribute->name();
326                 if (elementHasLegalLinkAttribute(element, attrName)) {
327                     // For links start with "javascript:", we do not change it.
328                     if (attrValue.startsWith("javascript:", false))
329                         result += attrValue;
330                     else {
331                         // Get the absolute link
332                         String completeURL = param->doc->completeURL(attrValue);
333                         // Check whether we have local files for those link.
334                         if (m_localLinks.contains(completeURL)) {
335                             if (!m_localDirectoryName.isEmpty())
336                                 result += "./" + m_localDirectoryName + "/";
337                             result += m_localLinks.get(completeURL);
338                         } else
339                             result += completeURL;
340                     }
341                 } else {
342                     if (param->isHTMLDocument)
343                         result += m_htmlEntities.convertEntitiesInString(attrValue);
344                     else
345                         result += m_xmlEntities.convertEntitiesInString(attrValue);
346                 }
347             }
348             result += "\"";
349         }
350     }
351 
352     // Do post action for open tag.
353     String addedContents = postActionAfterSerializeOpenTag(element, param);
354     // Complete the open tag for element when it has child/children.
355     if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd)
356         result += ">";
357     // Append the added contents generate in  post action of open tag.
358     result += addedContents;
359     // Save the result to data buffer.
360     saveHTMLContentToBuffer(result, param);
361 }
362 
363 // Serialize end tag of an specified element.
endTagToString(const Element * element,SerializeDomParam * param)364 void WebPageSerializerImpl::endTagToString(const Element* element,
365                                            SerializeDomParam* param)
366 {
367     bool needSkip;
368     // Do pre action for end tag.
369     String result = preActionBeforeSerializeEndTag(element,
370                                                    param,
371                                                    &needSkip);
372     if (needSkip)
373         return;
374     // Write end tag when element has child/children.
375     if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) {
376         result += "</";
377         result += element->nodeName();
378         result += ">";
379     } else {
380         // Check whether we have to write end tag for empty element.
381         if (param->isHTMLDocument) {
382             result += ">";
383             const HTMLElement* htmlElement =
384             static_cast<const HTMLElement*>(element);
385             if (htmlElement->endTagRequirement() == TagStatusRequired) {
386                 // We need to write end tag when it is required.
387                 result += "</";
388                 result += element->nodeName();
389                 result += ">";
390             }
391         } else {
392             // For xml base document.
393             result += " />";
394         }
395     }
396     // Do post action for end tag.
397     result += postActionAfterSerializeEndTag(element, param);
398     // Save the result to data buffer.
399     saveHTMLContentToBuffer(result, param);
400 }
401 
buildContentForNode(const Node * node,SerializeDomParam * param)402 void WebPageSerializerImpl::buildContentForNode(const Node* node,
403                                                 SerializeDomParam* param)
404 {
405     switch (node->nodeType()) {
406     case Node::ELEMENT_NODE:
407         // Process open tag of element.
408         openTagToString(static_cast<const Element*>(node), param);
409         // Walk through the children nodes and process it.
410         for (const Node *child = node->firstChild(); child; child = child->nextSibling())
411             buildContentForNode(child, param);
412         // Process end tag of element.
413         endTagToString(static_cast<const Element*>(node), param);
414         break;
415     case Node::TEXT_NODE:
416         saveHTMLContentToBuffer(createMarkup(node), param);
417         break;
418     case Node::ATTRIBUTE_NODE:
419     case Node::DOCUMENT_NODE:
420     case Node::DOCUMENT_FRAGMENT_NODE:
421         // Should not exist.
422         ASSERT_NOT_REACHED();
423         break;
424     // Document type node can be in DOM?
425     case Node::DOCUMENT_TYPE_NODE:
426         param->hasDoctype = true;
427     default:
428         // For other type node, call default action.
429         saveHTMLContentToBuffer(createMarkup(node), param);
430         break;
431     }
432 }
433 
WebPageSerializerImpl(WebFrame * frame,bool recursiveSerialization,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)434 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
435                                              bool recursiveSerialization,
436                                              WebPageSerializerClient* client,
437                                              const WebVector<WebURL>& links,
438                                              const WebVector<WebString>& localPaths,
439                                              const WebString& localDirectoryName)
440     : m_client(client)
441     , m_recursiveSerialization(recursiveSerialization)
442     , m_framesCollected(false)
443     , m_localDirectoryName(localDirectoryName)
444     , m_htmlEntities(false)
445     , m_xmlEntities(true)
446 {
447     // Must specify available webframe.
448     ASSERT(frame);
449     m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
450     // Make sure we have non 0 client.
451     ASSERT(client);
452     // Build local resources map.
453     ASSERT(links.size() == localPaths.size());
454     for (size_t i = 0; i < links.size(); i++) {
455         KURL url = links[i];
456         ASSERT(!m_localLinks.contains(url.string()));
457         m_localLinks.set(url.string(), localPaths[i]);
458     }
459 
460     ASSERT(!m_dataBuffer.length());
461 }
462 
collectTargetFrames()463 void WebPageSerializerImpl::collectTargetFrames()
464 {
465     ASSERT(!m_framesCollected);
466     m_framesCollected = true;
467 
468     // First, process main frame.
469     m_frames.append(m_specifiedWebFrameImpl);
470     // Return now if user only needs to serialize specified frame, not including
471     // all sub-frames.
472     if (!m_recursiveSerialization)
473         return;
474     // Collect all frames inside the specified frame.
475     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
476         WebFrameImpl* currentFrame = m_frames[i];
477         // Get current using document.
478         Document* currentDoc = currentFrame->frame()->document();
479         // Go through sub-frames.
480         RefPtr<HTMLAllCollection> all = currentDoc->all();
481         for (Node* node = all->firstItem(); node; node = all->nextItem()) {
482             if (!node->isHTMLElement())
483                 continue;
484             Element* element = static_cast<Element*>(node);
485             WebFrameImpl* webFrame =
486                 WebFrameImpl::fromFrameOwnerElement(element);
487             if (webFrame)
488                 m_frames.append(webFrame);
489         }
490     }
491 }
492 
serialize()493 bool WebPageSerializerImpl::serialize()
494 {
495     // Collect target frames.
496     if (!m_framesCollected)
497         collectTargetFrames();
498     bool didSerialization = false;
499     // Get KURL for main frame.
500     KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url();
501 
502     // Go through all frames for serializing DOM for whole page, include
503     // sub-frames.
504     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
505         // Get current serializing frame.
506         WebFrameImpl* currentFrame = m_frames[i];
507         // Get current using document.
508         Document* currentDoc = currentFrame->frame()->document();
509         // Get current frame's URL.
510         const KURL& currentFrameURL = currentFrame->frame()->loader()->url();
511 
512         // Check whether we have done this document.
513         if (m_localLinks.contains(currentFrameURL.string())) {
514             // A new document, we will serialize it.
515             didSerialization = true;
516             // Get target encoding for current document.
517             String encoding = currentFrame->frame()->loader()->encoding();
518             // Create the text encoding object with target encoding.
519             TextEncoding textEncoding(encoding);
520             // Construct serialize parameter for late processing document.
521             SerializeDomParam param(currentFrameURL,
522                                     encoding.length() ? textEncoding : UTF8Encoding(),
523                                     currentDoc,
524                                     currentFrameURL == mainPageURL ? m_localDirectoryName : "");
525 
526             // Process current document.
527             Element* rootElement = currentDoc->documentElement();
528             if (rootElement)
529                 buildContentForNode(rootElement, &param);
530 
531             // Flush the remainder data and finish serializing current frame.
532             encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished,
533                                  &param,
534                                  1);
535         }
536     }
537 
538     // We have done call frames, so we send message to embedder to tell it that
539     // frames are finished serializing.
540     ASSERT(!m_dataBuffer.length());
541     m_client->didSerializeDataForFrame(KURL(),
542                                        WebCString("", 0),
543                                        WebPageSerializerClient::AllFramesAreFinished);
544     return didSerialization;
545 }
546 
547 }  // namespace WebKit
548