1 /*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
// How to handle the base tag better.
// Current status:
// Currently, the way we handle the base tag is as follows:
34 // a) For those links which have corresponding local saved files, such as
35 // savable CSS, JavaScript files, they will be written to relative URLs which
36 // point to local saved file. Why those links can not be resolved as absolute
37 // file URLs, because if they are resolved as absolute URLs, after moving the
38 // file location from one directory to another directory, the file URLs will
39 // be dead links.
40 // b) For those links which have not corresponding local saved files, such as
41 // links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
43 // FireFox also uses above way to handle base tag.
44 //
45 // Problem:
46 // This way can not handle the following situation:
47 // the base tag is written by JavaScript.
48 // For example. The page "www.yahoo.com" use
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50 // of page when loading page. So when saving page as completed-HTML, we assume
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52 // completed-HTML page, then the JavaScript will insert a base tag
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54 // local saved resource files will be resolved as
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
56 // files can not be loaded correctly. Also the page will be rendered ugly since
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58 // files can not be fetched.
59 // Now FireFox, IE and WebKit based Browser all have this problem.
60 //
61 // Solution:
62 // My solution is that we comment old base tag and write new base tag:
63 // <base href="." ...> after the previous commented base tag. In WebKit, it
64 // always uses the latest "href" attribute of base tag to set document's base
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
66 // write a new base tag <base href="."> after the previous commented base tag.
67 // The new added base tag can help engine to locate correct base URL for
68 // correctly loading local saved resource files. Also I think we need to inherit
69 // the base target value from document object when appending new base tag.
70 // If there are multiple base tags in original document, we will comment all old
71 // base tags and append new base tag after each old base tag because we do not
72 // know those old base tags are original content or added by JavaScript. If
73 // they are added by JavaScript, it means when loading saved page, the script(s)
74 // will still insert base tag(s) to DOM, so the new added base tag(s) can
// override the incorrect base URL and make sure we always load correct local
76 // saved resource files.
77
78 #include "config.h"
79 #include "web/WebPageSerializerImpl.h"
80
81 #include "core/HTMLNames.h"
82 #include "core/dom/Document.h"
83 #include "core/dom/DocumentType.h"
84 #include "core/dom/Element.h"
85 #include "core/editing/markup.h"
86 #include "core/html/HTMLAllCollection.h"
87 #include "core/html/HTMLElement.h"
88 #include "core/html/HTMLFormElement.h"
89 #include "core/html/HTMLHtmlElement.h"
90 #include "core/html/HTMLMetaElement.h"
91 #include "core/loader/DocumentLoader.h"
92 #include "core/loader/FrameLoader.h"
93 #include "public/platform/WebVector.h"
94 #include "web/WebLocalFrameImpl.h"
95 #include "wtf/text/TextEncoding.h"
96
97 namespace blink {
98
// Soft upper bound on the length of the data buffer that temporarily holds
// generated HTML content before it is flushed to the client. The bound may be
// exceeded when a single very large contiguous string is found in the page.
static const unsigned dataBufferCapacity = 64 * 1024;
103
// Builds the per-frame serialization state: the frame URL, the text encoding
// used for output, the document being serialized and the name of the directory
// that holds locally saved resources. All progress flags start out cleared and
// no META element is marked to be skipped.
WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
    const WTF::TextEncoding& textEncoding,
    Document* document,
    const String& directoryName)
    : url(url)
    , textEncoding(textEncoding)
    , document(document)
    , directoryName(directoryName)
    // |document| is dereferenced here, so callers must pass a non-null pointer.
    , isHTMLDocument(document->isHTMLDocument())
    , haveSeenDocType(false)
    , haveAddedCharsetDeclaration(false)
    , skipMetaElement(0)
    , isInScriptOrStyleTag(false)
    , haveAddedXMLProcessingDirective(false)
    , haveAddedContentsBeforeEnd(false)
{
}
121
preActionBeforeSerializeOpenTag(const Element * element,SerializeDomParam * param,bool * needSkip)122 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
123 const Element* element, SerializeDomParam* param, bool* needSkip)
124 {
125 StringBuilder result;
126
127 *needSkip = false;
128 if (param->isHTMLDocument) {
129 // Skip the open tag of original META tag which declare charset since we
130 // have overrided the META which have correct charset declaration after
131 // serializing open tag of HEAD element.
132 ASSERT(element);
133 if (isHTMLMetaElement(*element)) {
134 const HTMLMetaElement& meta = toHTMLMetaElement(*element);
135 // Check whether the META tag has declared charset or not.
136 String equiv = meta.httpEquiv();
137 if (equalIgnoringCase(equiv, "content-type")) {
138 String content = meta.content();
139 if (content.length() && content.contains("charset", false)) {
140 // Find META tag declared charset, we need to skip it when
141 // serializing DOM.
142 param->skipMetaElement = element;
143 *needSkip = true;
144 }
145 }
146 } else if (isHTMLHtmlElement(*element)) {
147 // Check something before processing the open tag of HEAD element.
148 // First we add doc type declaration if original document has it.
149 if (!param->haveSeenDocType) {
150 param->haveSeenDocType = true;
151 result.append(createMarkup(param->document->doctype()));
152 }
153
154 // Add MOTW declaration before html tag.
155 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
156 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
157 } else if (isHTMLBaseElement(*element)) {
158 // Comment the BASE tag when serializing dom.
159 result.appendLiteral("<!--");
160 }
161 } else {
162 // Write XML declaration.
163 if (!param->haveAddedXMLProcessingDirective) {
164 param->haveAddedXMLProcessingDirective = true;
165 // Get encoding info.
166 String xmlEncoding = param->document->xmlEncoding();
167 if (xmlEncoding.isEmpty())
168 xmlEncoding = param->document->encodingName();
169 if (xmlEncoding.isEmpty())
170 xmlEncoding = UTF8Encoding().name();
171 result.appendLiteral("<?xml version=\"");
172 result.append(param->document->xmlVersion());
173 result.appendLiteral("\" encoding=\"");
174 result.append(xmlEncoding);
175 if (param->document->xmlStandalone())
176 result.appendLiteral("\" standalone=\"yes");
177 result.appendLiteral("\"?>\n");
178 }
179 // Add doc type declaration if original document has it.
180 if (!param->haveSeenDocType) {
181 param->haveSeenDocType = true;
182 result.append(createMarkup(param->document->doctype()));
183 }
184 }
185 return result.toString();
186 }
187
postActionAfterSerializeOpenTag(const Element * element,SerializeDomParam * param)188 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
189 const Element* element, SerializeDomParam* param)
190 {
191 StringBuilder result;
192
193 param->haveAddedContentsBeforeEnd = false;
194 if (!param->isHTMLDocument)
195 return result.toString();
196 // Check after processing the open tag of HEAD element
197 if (!param->haveAddedCharsetDeclaration
198 && isHTMLHeadElement(*element)) {
199 param->haveAddedCharsetDeclaration = true;
200 // Check meta element. WebKit only pre-parse the first 512 bytes
201 // of the document. If the whole <HEAD> is larger and meta is the
202 // end of head part, then this kind of pages aren't decoded correctly
203 // because of this issue. So when we serialize the DOM, we need to
204 // make sure the meta will in first child of head tag.
205 // See http://bugs.webkit.org/show_bug.cgi?id=16621.
206 // First we generate new content for writing correct META element.
207 result.append(WebPageSerializer::generateMetaCharsetDeclaration(
208 String(param->textEncoding.name())));
209
210 param->haveAddedContentsBeforeEnd = true;
211 // Will search each META which has charset declaration, and skip them all
212 // in PreActionBeforeSerializeOpenTag.
213 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
214 param->isInScriptOrStyleTag = true;
215 }
216
217 return result.toString();
218 }
219
preActionBeforeSerializeEndTag(const Element * element,SerializeDomParam * param,bool * needSkip)220 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
221 const Element* element, SerializeDomParam* param, bool* needSkip)
222 {
223 String result;
224
225 *needSkip = false;
226 if (!param->isHTMLDocument)
227 return result;
228 // Skip the end tag of original META tag which declare charset.
229 // Need not to check whether it's META tag since we guarantee
230 // skipMetaElement is definitely META tag if it's not 0.
231 if (param->skipMetaElement == element) {
232 *needSkip = true;
233 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
234 ASSERT(param->isInScriptOrStyleTag);
235 param->isInScriptOrStyleTag = false;
236 }
237
238 return result;
239 }
240
241 // After we finish serializing end tag of a element, we give the target
242 // element a chance to do some post work to add some additional data.
postActionAfterSerializeEndTag(const Element * element,SerializeDomParam * param)243 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
244 const Element* element, SerializeDomParam* param)
245 {
246 StringBuilder result;
247
248 if (!param->isHTMLDocument)
249 return result.toString();
250 // Comment the BASE tag when serializing DOM.
251 if (isHTMLBaseElement(*element)) {
252 result.appendLiteral("-->");
253 // Append a new base tag declaration.
254 result.append(WebPageSerializer::generateBaseTagDeclaration(
255 param->document->baseTarget()));
256 }
257
258 return result.toString();
259 }
260
saveHTMLContentToBuffer(const String & result,SerializeDomParam * param)261 void WebPageSerializerImpl::saveHTMLContentToBuffer(
262 const String& result, SerializeDomParam* param)
263 {
264 m_dataBuffer.append(result);
265 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
266 param,
267 DoNotForceFlush);
268 }
269
encodeAndFlushBuffer(WebPageSerializerClient::PageSerializationStatus status,SerializeDomParam * param,FlushOption flushOption)270 void WebPageSerializerImpl::encodeAndFlushBuffer(
271 WebPageSerializerClient::PageSerializationStatus status,
272 SerializeDomParam* param,
273 FlushOption flushOption)
274 {
275 // Data buffer is not full nor do we want to force flush.
276 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
277 return;
278
279 String content = m_dataBuffer.toString();
280 m_dataBuffer.clear();
281
282 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
283
284 // Send result to the client.
285 m_client->didSerializeDataForFrame(param->url,
286 WebCString(encodedContent.data(), encodedContent.length()),
287 status);
288 }
289
openTagToString(Element * element,SerializeDomParam * param)290 void WebPageSerializerImpl::openTagToString(Element* element,
291 SerializeDomParam* param)
292 {
293 bool needSkip;
294 StringBuilder result;
295 // Do pre action for open tag.
296 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
297 if (needSkip)
298 return;
299 // Add open tag
300 result.append('<');
301 result.append(element->nodeName().lower());
302 // Go through all attributes and serialize them.
303 AttributeCollection attributes = element->attributes();
304 AttributeCollection::iterator end = attributes.end();
305 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
306 result.append(' ');
307 // Add attribute pair
308 result.append(it->name().toString());
309 result.appendLiteral("=\"");
310 if (!it->value().isEmpty()) {
311 const String& attrValue = it->value();
312
313 // Check whether we need to replace some resource links
314 // with local resource paths.
315 const QualifiedName& attrName = it->name();
316 if (element->hasLegalLinkAttribute(attrName)) {
317 // For links start with "javascript:", we do not change it.
318 if (attrValue.startsWith("javascript:", false)) {
319 result.append(attrValue);
320 } else {
321 // Get the absolute link
322 WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
323 String completeURL = subFrame ? subFrame->frame()->document()->url() :
324 param->document->completeURL(attrValue);
325 // Check whether we have local files for those link.
326 if (m_localLinks.contains(completeURL)) {
327 if (!param->directoryName.isEmpty()) {
328 result.appendLiteral("./");
329 result.append(param->directoryName);
330 result.append('/');
331 }
332 result.append(m_localLinks.get(completeURL));
333 } else {
334 result.append(completeURL);
335 }
336 }
337 } else {
338 if (param->isHTMLDocument)
339 result.append(m_htmlEntities.convertEntitiesInString(attrValue));
340 else
341 result.append(m_xmlEntities.convertEntitiesInString(attrValue));
342 }
343 }
344 result.append('\"');
345 }
346
347 // Do post action for open tag.
348 String addedContents = postActionAfterSerializeOpenTag(element, param);
349 // Complete the open tag for element when it has child/children.
350 if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
351 result.append('>');
352 // Append the added contents generate in post action of open tag.
353 result.append(addedContents);
354 // Save the result to data buffer.
355 saveHTMLContentToBuffer(result.toString(), param);
356 }
357
358 // Serialize end tag of an specified element.
endTagToString(Element * element,SerializeDomParam * param)359 void WebPageSerializerImpl::endTagToString(Element* element,
360 SerializeDomParam* param)
361 {
362 bool needSkip;
363 StringBuilder result;
364 // Do pre action for end tag.
365 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
366 if (needSkip)
367 return;
368 // Write end tag when element has child/children.
369 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
370 result.appendLiteral("</");
371 result.append(element->nodeName().lower());
372 result.append('>');
373 } else {
374 // Check whether we have to write end tag for empty element.
375 if (param->isHTMLDocument) {
376 result.append('>');
377 // FIXME: This code is horribly wrong. WebPageSerializerImpl must die.
378 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
379 // We need to write end tag when it is required.
380 result.appendLiteral("</");
381 result.append(element->nodeName().lower());
382 result.append('>');
383 }
384 } else {
385 // For xml base document.
386 result.appendLiteral(" />");
387 }
388 }
389 // Do post action for end tag.
390 result.append(postActionAfterSerializeEndTag(element, param));
391 // Save the result to data buffer.
392 saveHTMLContentToBuffer(result.toString(), param);
393 }
394
buildContentForNode(Node * node,SerializeDomParam * param)395 void WebPageSerializerImpl::buildContentForNode(Node* node,
396 SerializeDomParam* param)
397 {
398 switch (node->nodeType()) {
399 case Node::ELEMENT_NODE:
400 // Process open tag of element.
401 openTagToString(toElement(node), param);
402 // Walk through the children nodes and process it.
403 for (Node *child = node->firstChild(); child; child = child->nextSibling())
404 buildContentForNode(child, param);
405 // Process end tag of element.
406 endTagToString(toElement(node), param);
407 break;
408 case Node::TEXT_NODE:
409 saveHTMLContentToBuffer(createMarkup(node), param);
410 break;
411 case Node::ATTRIBUTE_NODE:
412 case Node::DOCUMENT_NODE:
413 case Node::DOCUMENT_FRAGMENT_NODE:
414 // Should not exist.
415 ASSERT_NOT_REACHED();
416 break;
417 // Document type node can be in DOM?
418 case Node::DOCUMENT_TYPE_NODE:
419 param->haveSeenDocType = true;
420 default:
421 // For other type node, call default action.
422 saveHTMLContentToBuffer(createMarkup(node), param);
423 break;
424 }
425 }
426
WebPageSerializerImpl(WebFrame * frame,bool recursiveSerialization,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
428 bool recursiveSerialization,
429 WebPageSerializerClient* client,
430 const WebVector<WebURL>& links,
431 const WebVector<WebString>& localPaths,
432 const WebString& localDirectoryName)
433 : m_client(client)
434 , m_recursiveSerialization(recursiveSerialization)
435 , m_framesCollected(false)
436 , m_localDirectoryName(localDirectoryName)
437 , m_htmlEntities(false)
438 , m_xmlEntities(true)
439 {
440 // Must specify available webframe.
441 ASSERT(frame);
442 m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
443 // Make sure we have non 0 client.
444 ASSERT(client);
445 // Build local resources map.
446 ASSERT(links.size() == localPaths.size());
447 for (size_t i = 0; i < links.size(); i++) {
448 KURL url = links[i];
449 ASSERT(!m_localLinks.contains(url.string()));
450 m_localLinks.set(url.string(), localPaths[i]);
451 }
452
453 ASSERT(m_dataBuffer.isEmpty());
454 }
455
collectTargetFrames()456 void WebPageSerializerImpl::collectTargetFrames()
457 {
458 ASSERT(!m_framesCollected);
459 m_framesCollected = true;
460
461 // First, process main frame.
462 m_frames.append(m_specifiedWebLocalFrameImpl);
463 // Return now if user only needs to serialize specified frame, not including
464 // all sub-frames.
465 if (!m_recursiveSerialization)
466 return;
467 // Collect all frames inside the specified frame.
468 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
469 WebLocalFrameImpl* currentFrame = m_frames[i];
470 // Get current using document.
471 Document* currentDoc = currentFrame->frame()->document();
472 // Go through sub-frames.
473 RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
474
475 for (unsigned i = 0; Element* element = all->item(i); ++i) {
476 if (!element->isHTMLElement())
477 continue;
478 WebLocalFrameImpl* webFrame =
479 WebLocalFrameImpl::fromFrameOwnerElement(element);
480 if (webFrame)
481 m_frames.append(webFrame);
482 }
483 }
484 }
485
serialize()486 bool WebPageSerializerImpl::serialize()
487 {
488 if (!m_framesCollected)
489 collectTargetFrames();
490
491 bool didSerialization = false;
492 KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
493
494 for (unsigned i = 0; i < m_frames.size(); ++i) {
495 WebLocalFrameImpl* webFrame = m_frames[i];
496 Document* document = webFrame->frame()->document();
497 const KURL& url = document->url();
498
499 if (!url.isValid() || !m_localLinks.contains(url.string()))
500 continue;
501
502 didSerialization = true;
503
504 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
505 String directoryName = url == mainURL ? m_localDirectoryName : "";
506
507 SerializeDomParam param(url, textEncoding, document, directoryName);
508
509 Element* documentElement = document->documentElement();
510 if (documentElement)
511 buildContentForNode(documentElement, ¶m);
512
513 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush);
514 }
515
516 ASSERT(m_dataBuffer.isEmpty());
517 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
518 return didSerialization;
519 }
520
521 } // namespace blink
522