/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
30 
31 #include "config.h"
32 #include "WebPageSerializer.h"
33 
34 #include "DocumentLoader.h"
35 #include "Element.h"
36 #include "Frame.h"
37 #include "HTMLAllCollection.h"
38 #include "HTMLFrameOwnerElement.h"
39 #include "HTMLInputElement.h"
40 #include "HTMLNames.h"
41 #include "KURL.h"
42 #include "Vector.h"
43 
44 #include "WebCString.h"
45 #include "WebFrame.h"
46 #include "WebFrameImpl.h"
47 #include "WebPageSerializerClient.h"
48 #include "WebPageSerializerImpl.h"
49 #include "WebString.h"
50 #include "WebURL.h"
51 #include "WebVector.h"
52 #include "WebView.h"
53 
54 #include <wtf/text/StringConcatenate.h>
55 
56 using namespace WebCore;
57 
58 namespace {
59 
getSubResourceURLFromElement(Element * element)60 KURL getSubResourceURLFromElement(Element* element)
61 {
62     ASSERT(element);
63     const QualifiedName* attributeName = 0;
64     if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
65         attributeName = &HTMLNames::srcAttr;
66     else if (element->hasTagName(HTMLNames::inputTag)) {
67         HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
68         if (input->isImageButton())
69             attributeName = &HTMLNames::srcAttr;
70     } else if (element->hasTagName(HTMLNames::bodyTag)
71                || element->hasTagName(HTMLNames::tableTag)
72                || element->hasTagName(HTMLNames::trTag)
73                || element->hasTagName(HTMLNames::tdTag))
74         attributeName = &HTMLNames::backgroundAttr;
75     else if (element->hasTagName(HTMLNames::blockquoteTag)
76              || element->hasTagName(HTMLNames::qTag)
77              || element->hasTagName(HTMLNames::delTag)
78              || element->hasTagName(HTMLNames::insTag))
79         attributeName = &HTMLNames::citeAttr;
80     else if (element->hasTagName(HTMLNames::linkTag)) {
81         // If the link element is not css, ignore it.
82         if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
83             // FIXME: Add support for extracting links of sub-resources which
84             // are inside style-sheet such as @import, @font-face, url(), etc.
85             attributeName = &HTMLNames::hrefAttr;
86         }
87     } else if (element->hasTagName(HTMLNames::objectTag))
88         attributeName = &HTMLNames::dataAttr;
89     else if (element->hasTagName(HTMLNames::embedTag))
90         attributeName = &HTMLNames::srcAttr;
91 
92     if (!attributeName)
93         return KURL();
94 
95     String value = element->getAttribute(*attributeName);
96     // Ignore javascript content.
97     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
98         return KURL();
99 
100     return element->document()->completeURL(value);
101 }
102 
retrieveResourcesForElement(Element * element,Vector<Frame * > * visitedFrames,Vector<Frame * > * framesToVisit,Vector<KURL> * frameURLs,Vector<KURL> * resourceURLs)103 void retrieveResourcesForElement(Element* element,
104                                  Vector<Frame*>* visitedFrames,
105                                  Vector<Frame*>* framesToVisit,
106                                  Vector<KURL>* frameURLs,
107                                  Vector<KURL>* resourceURLs)
108 {
109     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
110     if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
111         || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
112             && element->isFrameOwnerElement()) {
113         Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
114         if (frame) {
115             if (!visitedFrames->contains(frame))
116                 framesToVisit->append(frame);
117             return;
118         }
119     }
120 
121     KURL url = getSubResourceURLFromElement(element);
122     if (url.isEmpty() || !url.isValid())
123         return; // No subresource for this node.
124 
125     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
126     // does no have a cache mechanism, we skip it as well.
127     if (!url.protocolInHTTPFamily() && !url.isLocalFile())
128         return;
129 
130     if (!resourceURLs->contains(url))
131         resourceURLs->append(url);
132 }
133 
retrieveResourcesForFrame(Frame * frame,const WebKit::WebVector<WebKit::WebCString> & supportedSchemes,Vector<Frame * > * visitedFrames,Vector<Frame * > * framesToVisit,Vector<KURL> * frameURLs,Vector<KURL> * resourceURLs)134 void retrieveResourcesForFrame(Frame* frame,
135                                const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
136                                Vector<Frame*>* visitedFrames,
137                                Vector<Frame*>* framesToVisit,
138                                Vector<KURL>* frameURLs,
139                                Vector<KURL>* resourceURLs)
140 {
141     KURL frameURL = frame->loader()->documentLoader()->request().url();
142 
143     // If the frame's URL is invalid, ignore it, it is not retrievable.
144     if (!frameURL.isValid())
145         return;
146 
147     // Ignore frames from unsupported schemes.
148     bool isValidScheme = false;
149     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
150         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
151             isValidScheme = true;
152             break;
153         }
154     }
155     if (!isValidScheme)
156         return;
157 
158     // If we have already seen that frame, ignore it.
159     if (visitedFrames->contains(frame))
160         return;
161     visitedFrames->append(frame);
162     if (!frameURLs->contains(frameURL))
163         frameURLs->append(frameURL);
164 
165     // Now get the resources associated with each node of the document.
166     RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
167     for (unsigned i = 0; i < allNodes->length(); ++i) {
168         Node* node = allNodes->item(i);
169         // We are only interested in HTML resources.
170         if (!node->isElementNode())
171             continue;
172         retrieveResourcesForElement(static_cast<Element*>(node),
173                                     visitedFrames, framesToVisit,
174                                     frameURLs, resourceURLs);
175     }
176 }
177 
178 } // namespace
179 
180 namespace WebKit {
181 
serialize(WebFrame * frame,bool recursive,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)182 bool WebPageSerializer::serialize(WebFrame* frame,
183                                   bool recursive,
184                                   WebPageSerializerClient* client,
185                                   const WebVector<WebURL>& links,
186                                   const WebVector<WebString>& localPaths,
187                                   const WebString& localDirectoryName)
188 {
189     WebPageSerializerImpl serializerImpl(
190         frame, recursive, client, links, localPaths, localDirectoryName);
191     return serializerImpl.serialize();
192 }
193 
retrieveAllResources(WebView * view,const WebVector<WebCString> & supportedSchemes,WebVector<WebURL> * resourceURLs,WebVector<WebURL> * frameURLs)194 bool WebPageSerializer::retrieveAllResources(WebView* view,
195                                              const WebVector<WebCString>& supportedSchemes,
196                                              WebVector<WebURL>* resourceURLs,
197                                              WebVector<WebURL>* frameURLs) {
198     WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
199     if (!mainFrame)
200         return false;
201 
202     Vector<Frame*> framesToVisit;
203     Vector<Frame*> visitedFrames;
204     Vector<KURL> frameKURLs;
205     Vector<KURL> resourceKURLs;
206 
207     // Let's retrieve the resources from every frame in this page.
208     framesToVisit.append(mainFrame->frame());
209     while (!framesToVisit.isEmpty()) {
210         Frame* frame = framesToVisit[0];
211         framesToVisit.remove(0);
212         retrieveResourcesForFrame(frame, supportedSchemes,
213                                   &visitedFrames, &framesToVisit,
214                                   &frameKURLs, &resourceKURLs);
215     }
216 
217     // Converts the results to WebURLs.
218     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
219     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
220         resultResourceURLs[i] = resourceKURLs[i];
221         // A frame's src can point to the same URL as another resource, keep the
222         // resource URL only in such cases.
223         size_t index = frameKURLs.find(resourceKURLs[i]);
224         if (index != notFound)
225             frameKURLs.remove(index);
226     }
227     *resourceURLs = resultResourceURLs;
228     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
229     for (size_t i = 0; i < frameKURLs.size(); ++i)
230         resultFrameURLs[i] = frameKURLs[i];
231     *frameURLs = resultFrameURLs;
232 
233     return true;
234 }
235 
// Builds a <meta http-equiv="Content-Type"> tag declaring text/html content
// with the given |charset|. The charset text is inserted as-is.
WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
    static const char tagPrefix[] = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=";
    static const char tagSuffix[] = "\">";
    return makeString(tagPrefix, static_cast<const String&>(charset), tagSuffix);
}
240 
generateMarkOfTheWebDeclaration(const WebURL & url)241 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
242 {
243     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
244                           static_cast<int>(url.spec().length()),
245                           url.spec().data());
246 }
247 
// Builds a <base> tag whose href is ".". When |baseTarget| is non-empty it is
// added, as-is, as the tag's target attribute.
WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
{
    if (!baseTarget.isEmpty())
        return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");

    // No target requested: emit the href-only form.
    return makeString("<base href=\".\">");
}
254 
255 } // namespace WebKit
256