• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "config.h"
32 #include "public/web/WebPageSerializer.h"
33 
34 #include "core/HTMLNames.h"
35 #include "core/dom/Document.h"
36 #include "core/dom/Element.h"
37 #include "core/frame/LocalFrame.h"
38 #include "core/html/HTMLAllCollection.h"
39 #include "core/html/HTMLFrameElementBase.h"
40 #include "core/html/HTMLFrameOwnerElement.h"
41 #include "core/html/HTMLInputElement.h"
42 #include "core/html/HTMLTableElement.h"
43 #include "core/loader/DocumentLoader.h"
44 #include "core/page/Page.h"
45 #include "core/page/PageSerializer.h"
46 #include "platform/SerializedResource.h"
47 #include "platform/mhtml/MHTMLArchive.h"
48 #include "platform/weborigin/KURL.h"
49 #include "public/platform/WebCString.h"
50 #include "public/platform/WebString.h"
51 #include "public/platform/WebURL.h"
52 #include "public/platform/WebVector.h"
53 #include "public/web/WebFrame.h"
54 #include "public/web/WebPageSerializerClient.h"
55 #include "public/web/WebView.h"
56 #include "web/WebLocalFrameImpl.h"
57 #include "web/WebPageSerializerImpl.h"
58 #include "web/WebViewImpl.h"
59 #include "wtf/Vector.h"
60 #include "wtf/text/StringConcatenate.h"
61 
62 namespace blink {
63 
64 namespace {
65 
getSubResourceURLFromElement(Element * element)66 KURL getSubResourceURLFromElement(Element* element)
67 {
68     ASSERT(element);
69     const QualifiedName& attributeName = element->subResourceAttributeName();
70     if (attributeName == QualifiedName::null())
71         return KURL();
72 
73     String value = element->getAttribute(attributeName);
74     // Ignore javascript content.
75     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
76         return KURL();
77 
78     return element->document().completeURL(value);
79 }
80 
retrieveResourcesForElement(Element * element,Vector<LocalFrame * > * visitedFrames,Vector<LocalFrame * > * framesToVisit,Vector<KURL> * frameURLs,Vector<KURL> * resourceURLs)81 void retrieveResourcesForElement(Element* element,
82                                  Vector<LocalFrame*>* visitedFrames,
83                                  Vector<LocalFrame*>* framesToVisit,
84                                  Vector<KURL>* frameURLs,
85                                  Vector<KURL>* resourceURLs)
86 {
87     ASSERT(element);
88     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
89     if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) {
90         Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
91         if (frame && frame->isLocalFrame()) {
92             if (!visitedFrames->contains(toLocalFrame(frame)))
93                 framesToVisit->append(toLocalFrame(frame));
94             return;
95         }
96     }
97 
98     KURL url = getSubResourceURLFromElement(element);
99     if (url.isEmpty() || !url.isValid())
100         return; // No subresource for this node.
101 
102     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
103     // does no have a cache mechanism, we skip it as well.
104     if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
105         return;
106 
107     if (!resourceURLs->contains(url))
108         resourceURLs->append(url);
109 }
110 
retrieveResourcesForFrame(LocalFrame * frame,const WebVector<WebCString> & supportedSchemes,Vector<LocalFrame * > * visitedFrames,Vector<LocalFrame * > * framesToVisit,Vector<KURL> * frameURLs,Vector<KURL> * resourceURLs)111 void retrieveResourcesForFrame(LocalFrame* frame,
112     const WebVector<WebCString>& supportedSchemes,
113     Vector<LocalFrame*>* visitedFrames,
114     Vector<LocalFrame*>* framesToVisit,
115     Vector<KURL>* frameURLs,
116     Vector<KURL>* resourceURLs)
117 {
118     KURL frameURL = frame->loader().documentLoader()->request().url();
119 
120     // If the frame's URL is invalid, ignore it, it is not retrievable.
121     if (!frameURL.isValid())
122         return;
123 
124     // Ignore frames from unsupported schemes.
125     bool isValidScheme = false;
126     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
127         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
128             isValidScheme = true;
129             break;
130         }
131     }
132     if (!isValidScheme)
133         return;
134 
135     // If we have already seen that frame, ignore it.
136     if (visitedFrames->contains(frame))
137         return;
138     visitedFrames->append(frame);
139     if (!frameURLs->contains(frameURL))
140         frameURLs->append(frameURL);
141 
142     // Now get the resources associated with each node of the document.
143     RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all();
144     for (unsigned i = 0; i < allElements->length(); ++i) {
145         Element* element = allElements->item(i);
146         retrieveResourcesForElement(element,
147                                     visitedFrames, framesToVisit,
148                                     frameURLs, resourceURLs);
149     }
150 }
151 
152 } // namespace
153 
serialize(WebView * view,WebVector<WebPageSerializer::Resource> * resourcesParam)154 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
155 {
156     Vector<SerializedResource> resources;
157     PageSerializer serializer(&resources);
158     serializer.serialize(toWebViewImpl(view)->page());
159 
160     Vector<Resource> result;
161     for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
162         Resource resource;
163         resource.url = iter->url;
164         resource.mimeType = iter->mimeType.ascii();
165         // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
166         resource.data = WebCString(iter->data->data(), iter->data->size());
167         result.append(resource);
168     }
169 
170     *resourcesParam = result;
171 }
172 
serializePageToMHTML(Page * page,MHTMLArchive::EncodingPolicy encodingPolicy)173 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
174 {
175     Vector<SerializedResource> resources;
176     PageSerializer serializer(&resources);
177     serializer.serialize(page);
178     Document* document = page->deprecatedLocalMainFrame()->document();
179     return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
180 }
181 
serializeToMHTML(WebView * view)182 WebCString WebPageSerializer::serializeToMHTML(WebView* view)
183 {
184     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding);
185     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
186     return WebCString(mhtml->data(), mhtml->size());
187 }
188 
serializeToMHTMLUsingBinaryEncoding(WebView * view)189 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
190 {
191     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding);
192     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
193     return WebCString(mhtml->data(), mhtml->size());
194 }
195 
serialize(WebLocalFrame * frame,bool recursive,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)196 bool WebPageSerializer::serialize(WebLocalFrame* frame,
197                                   bool recursive,
198                                   WebPageSerializerClient* client,
199                                   const WebVector<WebURL>& links,
200                                   const WebVector<WebString>& localPaths,
201                                   const WebString& localDirectoryName)
202 {
203     WebPageSerializerImpl serializerImpl(
204         frame, recursive, client, links, localPaths, localDirectoryName);
205     return serializerImpl.serialize();
206 }
207 
retrieveAllResources(WebView * view,const WebVector<WebCString> & supportedSchemes,WebVector<WebURL> * resourceURLs,WebVector<WebURL> * frameURLs)208 bool WebPageSerializer::retrieveAllResources(WebView* view,
209                                              const WebVector<WebCString>& supportedSchemes,
210                                              WebVector<WebURL>* resourceURLs,
211                                              WebVector<WebURL>* frameURLs) {
212     WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame());
213     if (!mainFrame)
214         return false;
215 
216     Vector<LocalFrame*> framesToVisit;
217     Vector<LocalFrame*> visitedFrames;
218     Vector<KURL> frameKURLs;
219     Vector<KURL> resourceKURLs;
220 
221     // Let's retrieve the resources from every frame in this page.
222     framesToVisit.append(mainFrame->frame());
223     while (!framesToVisit.isEmpty()) {
224         LocalFrame* frame = framesToVisit[0];
225         framesToVisit.remove(0);
226         retrieveResourcesForFrame(frame, supportedSchemes,
227                                   &visitedFrames, &framesToVisit,
228                                   &frameKURLs, &resourceKURLs);
229     }
230 
231     // Converts the results to WebURLs.
232     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
233     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
234         resultResourceURLs[i] = resourceKURLs[i];
235         // A frame's src can point to the same URL as another resource, keep the
236         // resource URL only in such cases.
237         size_t index = frameKURLs.find(resourceKURLs[i]);
238         if (index != kNotFound)
239             frameKURLs.remove(index);
240     }
241     *resourceURLs = resultResourceURLs;
242     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
243     for (size_t i = 0; i < frameKURLs.size(); ++i)
244         resultFrameURLs[i] = frameKURLs[i];
245     *frameURLs = resultFrameURLs;
246 
247     return true;
248 }
249 
// Returns a <meta http-equiv="Content-Type"> tag declaring |charset| as the
// character set of a text/html document. |charset| is inserted as given.
WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
    const String charsetName = charset;
    String declaration = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + charsetName + "\">";
    return declaration;
}
255 
generateMarkOfTheWebDeclaration(const WebURL & url)256 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
257 {
258     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
259                           static_cast<int>(url.spec().length()),
260                           url.spec().data());
261 }
262 
// Returns a <base href="."> tag. When |baseTarget| is non-empty the tag also
// carries a target attribute holding that value.
//
// |baseTarget| is written inside a double-quoted attribute value, so '&' and
// '"' are replaced by character references; otherwise a value containing a
// quote could close the attribute and add arbitrary markup to the output.
// Values without those two characters produce the same tag as before.
WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
{
    if (baseTarget.isEmpty())
        return String("<base href=\".\">");

    String escapedTarget = baseTarget;
    // '&' has to be handled first so the ampersands introduced by "&quot;"
    // are not escaped a second time.
    escapedTarget.replace('&', String("&amp;"));
    escapedTarget.replace('"', String("&quot;"));

    String baseString = "<base href=\".\" target=\"" + escapedTarget + "\">";
    return baseString;
}
270 
271 } // namespace blink
272