1 /*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
// How to handle the base tag better.
// Current status:
// At present, the normal way we handle the base tag is:
34 // a) For those links which have corresponding local saved files, such as
35 // savable CSS, JavaScript files, they will be written to relative URLs which
36 // point to local saved file. Why those links can not be resolved as absolute
37 // file URLs, because if they are resolved as absolute URLs, after moving the
38 // file location from one directory to another directory, the file URLs will
39 // be dead links.
40 // b) For those links which have not corresponding local saved files, such as
41 // links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
43 // FireFox also uses above way to handle base tag.
44 //
45 // Problem:
46 // This way can not handle the following situation:
47 // the base tag is written by JavaScript.
48 // For example. The page "www.yahoo.com" use
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
50 // of page when loading page. So when saving page as completed-HTML, we assume
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
52 // completed-HTML page, then the JavaScript will insert a base tag
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
54 // local saved resource files will be resolved as
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
56 // files can not be loaded correctly. Also the page will be rendered ugly since
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
58 // files can not be fetched.
59 // Now FireFox, IE and WebKit based Browser all have this problem.
60 //
61 // Solution:
62 // My solution is that we comment old base tag and write new base tag:
63 // <base href="." ...> after the previous commented base tag. In WebKit, it
64 // always uses the latest "href" attribute of base tag to set document's base
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and
66 // write a new base tag <base href="."> after the previous commented base tag.
67 // The new added base tag can help engine to locate correct base URL for
68 // correctly loading local saved resource files. Also I think we need to inherit
69 // the base target value from document object when appending new base tag.
70 // If there are multiple base tags in original document, we will comment all old
71 // base tags and append new base tag after each old base tag because we do not
72 // know those old base tags are original content or added by JavaScript. If
73 // they are added by JavaScript, it means when loading saved page, the script(s)
// will still insert base tag(s) to DOM, so the new added base tag(s) can
// override the incorrect base URL and make sure we always load correct local
// saved resource files.
77
78 #include "config.h"
79 #include "WebPageSerializerImpl.h"
80
81 #include "Document.h"
82 #include "DocumentLoader.h"
83 #include "DocumentType.h"
84 #include "Element.h"
85 #include "FrameLoader.h"
86 #include "HTMLAllCollection.h"
87 #include "HTMLElement.h"
88 #include "HTMLFormElement.h"
89 #include "HTMLMetaElement.h"
90 #include "HTMLNames.h"
91 #include "KURL.h"
92 #include "TextEncoding.h"
93 #include "markup.h"
94
95 #include "DOMUtilitiesPrivate.h"
96 #include "WebFrameImpl.h"
97 #include "WebURL.h"
98 #include "WebVector.h"
99
100 using namespace WebCore;
101
102 namespace WebKit {
103
// Maximum length of the data buffer that temporarily holds generated HTML
// content. This is a soft limit: it may be exceeded when a single very large
// contiguous string is found in the page.
static const unsigned dataBufferCapacity = 64 * 1024;
108
SerializeDomParam(const KURL & url,const TextEncoding & textEncoding,Document * document,const String & directoryName)109 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
110 const TextEncoding& textEncoding,
111 Document* document,
112 const String& directoryName)
113 : url(url)
114 , textEncoding(textEncoding)
115 , document(document)
116 , directoryName(directoryName)
117 , isHTMLDocument(document->isHTMLDocument())
118 , haveSeenDocType(false)
119 , haveAddedCharsetDeclaration(false)
120 , skipMetaElement(0)
121 , isInScriptOrStyleTag(false)
122 , haveAddedXMLProcessingDirective(false)
123 , haveAddedContentsBeforeEnd(false)
124 {
125 }
126
preActionBeforeSerializeOpenTag(const Element * element,SerializeDomParam * param,bool * needSkip)127 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
128 const Element* element, SerializeDomParam* param, bool* needSkip)
129 {
130 StringBuilder result;
131
132 *needSkip = false;
133 if (param->isHTMLDocument) {
134 // Skip the open tag of original META tag which declare charset since we
135 // have overrided the META which have correct charset declaration after
136 // serializing open tag of HEAD element.
137 if (element->hasTagName(HTMLNames::metaTag)) {
138 const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
139 // Check whether the META tag has declared charset or not.
140 String equiv = meta->httpEquiv();
141 if (equalIgnoringCase(equiv, "content-type")) {
142 String content = meta->content();
143 if (content.length() && content.contains("charset", false)) {
144 // Find META tag declared charset, we need to skip it when
145 // serializing DOM.
146 param->skipMetaElement = element;
147 *needSkip = true;
148 }
149 }
150 } else if (element->hasTagName(HTMLNames::htmlTag)) {
151 // Check something before processing the open tag of HEAD element.
152 // First we add doc type declaration if original document has it.
153 if (!param->haveSeenDocType) {
154 param->haveSeenDocType = true;
155 result.append(createMarkup(param->document->doctype()));
156 }
157
158 // Add MOTW declaration before html tag.
159 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
160 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
161 } else if (element->hasTagName(HTMLNames::baseTag)) {
162 // Comment the BASE tag when serializing dom.
163 result.append("<!--");
164 }
165 } else {
166 // Write XML declaration.
167 if (!param->haveAddedXMLProcessingDirective) {
168 param->haveAddedXMLProcessingDirective = true;
169 // Get encoding info.
170 String xmlEncoding = param->document->xmlEncoding();
171 if (xmlEncoding.isEmpty())
172 xmlEncoding = param->document->loader()->writer()->encoding();
173 if (xmlEncoding.isEmpty())
174 xmlEncoding = UTF8Encoding().name();
175 result.append("<?xml version=\"");
176 result.append(param->document->xmlVersion());
177 result.append("\" encoding=\"");
178 result.append(xmlEncoding);
179 if (param->document->xmlStandalone())
180 result.append("\" standalone=\"yes");
181 result.append("\"?>\n");
182 }
183 // Add doc type declaration if original document has it.
184 if (!param->haveSeenDocType) {
185 param->haveSeenDocType = true;
186 result.append(createMarkup(param->document->doctype()));
187 }
188 }
189 return result.toString();
190 }
191
postActionAfterSerializeOpenTag(const Element * element,SerializeDomParam * param)192 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
193 const Element* element, SerializeDomParam* param)
194 {
195 StringBuilder result;
196
197 param->haveAddedContentsBeforeEnd = false;
198 if (!param->isHTMLDocument)
199 return result.toString();
200 // Check after processing the open tag of HEAD element
201 if (!param->haveAddedCharsetDeclaration
202 && element->hasTagName(HTMLNames::headTag)) {
203 param->haveAddedCharsetDeclaration = true;
204 // Check meta element. WebKit only pre-parse the first 512 bytes
205 // of the document. If the whole <HEAD> is larger and meta is the
206 // end of head part, then this kind of pages aren't decoded correctly
207 // because of this issue. So when we serialize the DOM, we need to
208 // make sure the meta will in first child of head tag.
209 // See http://bugs.webkit.org/show_bug.cgi?id=16621.
210 // First we generate new content for writing correct META element.
211 result.append(WebPageSerializer::generateMetaCharsetDeclaration(
212 String(param->textEncoding.name())));
213
214 param->haveAddedContentsBeforeEnd = true;
215 // Will search each META which has charset declaration, and skip them all
216 // in PreActionBeforeSerializeOpenTag.
217 } else if (element->hasTagName(HTMLNames::scriptTag)
218 || element->hasTagName(HTMLNames::styleTag)) {
219 param->isInScriptOrStyleTag = true;
220 }
221
222 return result.toString();
223 }
224
preActionBeforeSerializeEndTag(const Element * element,SerializeDomParam * param,bool * needSkip)225 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
226 const Element* element, SerializeDomParam* param, bool* needSkip)
227 {
228 String result;
229
230 *needSkip = false;
231 if (!param->isHTMLDocument)
232 return result;
233 // Skip the end tag of original META tag which declare charset.
234 // Need not to check whether it's META tag since we guarantee
235 // skipMetaElement is definitely META tag if it's not 0.
236 if (param->skipMetaElement == element)
237 *needSkip = true;
238 else if (element->hasTagName(HTMLNames::scriptTag)
239 || element->hasTagName(HTMLNames::styleTag)) {
240 ASSERT(param->isInScriptOrStyleTag);
241 param->isInScriptOrStyleTag = false;
242 }
243
244 return result;
245 }
246
247 // After we finish serializing end tag of a element, we give the target
248 // element a chance to do some post work to add some additional data.
postActionAfterSerializeEndTag(const Element * element,SerializeDomParam * param)249 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
250 const Element* element, SerializeDomParam* param)
251 {
252 StringBuilder result;
253
254 if (!param->isHTMLDocument)
255 return result.toString();
256 // Comment the BASE tag when serializing DOM.
257 if (element->hasTagName(HTMLNames::baseTag)) {
258 result.append("-->");
259 // Append a new base tag declaration.
260 result.append(WebPageSerializer::generateBaseTagDeclaration(
261 param->document->baseTarget()));
262 }
263
264 return result.toString();
265 }
266
saveHTMLContentToBuffer(const String & result,SerializeDomParam * param)267 void WebPageSerializerImpl::saveHTMLContentToBuffer(
268 const String& result, SerializeDomParam* param)
269 {
270 m_dataBuffer.append(result);
271 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
272 param,
273 DoNotForceFlush);
274 }
275
encodeAndFlushBuffer(WebPageSerializerClient::PageSerializationStatus status,SerializeDomParam * param,FlushOption flushOption)276 void WebPageSerializerImpl::encodeAndFlushBuffer(
277 WebPageSerializerClient::PageSerializationStatus status,
278 SerializeDomParam* param,
279 FlushOption flushOption)
280 {
281 // Data buffer is not full nor do we want to force flush.
282 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
283 return;
284
285 String content = m_dataBuffer.toString();
286 m_dataBuffer = StringBuilder();
287
288 // Convert the unicode content to target encoding
289 CString encodedContent = param->textEncoding.encode(
290 content.characters(), content.length(), EntitiesForUnencodables);
291
292 // Send result to the client.
293 m_client->didSerializeDataForFrame(param->url,
294 WebCString(encodedContent.data(), encodedContent.length()),
295 status);
296 }
297
openTagToString(Element * element,SerializeDomParam * param)298 void WebPageSerializerImpl::openTagToString(Element* element,
299 SerializeDomParam* param)
300 {
301 // FIXME: use StringBuilder instead of String.
302 bool needSkip;
303 // Do pre action for open tag.
304 String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
305 if (needSkip)
306 return;
307 // Add open tag
308 result += "<" + element->nodeName().lower();
309 // Go through all attributes and serialize them.
310 const NamedNodeMap *attrMap = element->attributes(true);
311 if (attrMap) {
312 unsigned numAttrs = attrMap->length();
313 for (unsigned i = 0; i < numAttrs; i++) {
314 result += " ";
315 // Add attribute pair
316 const Attribute *attribute = attrMap->attributeItem(i);
317 result += attribute->name().toString();
318 result += "=\"";
319 if (!attribute->value().isEmpty()) {
320 const String& attrValue = attribute->value();
321
322 // Check whether we need to replace some resource links
323 // with local resource paths.
324 const QualifiedName& attrName = attribute->name();
325 if (elementHasLegalLinkAttribute(element, attrName)) {
326 // For links start with "javascript:", we do not change it.
327 if (attrValue.startsWith("javascript:", false))
328 result += attrValue;
329 else {
330 // Get the absolute link
331 WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element);
332 String completeURL = subFrame ? subFrame->frame()->document()->url() :
333 param->document->completeURL(attrValue);
334 // Check whether we have local files for those link.
335 if (m_localLinks.contains(completeURL)) {
336 if (!param->directoryName.isEmpty())
337 result += "./" + param->directoryName + "/";
338 result += m_localLinks.get(completeURL);
339 } else
340 result += completeURL;
341 }
342 } else {
343 if (param->isHTMLDocument)
344 result += m_htmlEntities.convertEntitiesInString(attrValue);
345 else
346 result += m_xmlEntities.convertEntitiesInString(attrValue);
347 }
348 }
349 result += "\"";
350 }
351 }
352
353 // Do post action for open tag.
354 String addedContents = postActionAfterSerializeOpenTag(element, param);
355 // Complete the open tag for element when it has child/children.
356 if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd)
357 result += ">";
358 // Append the added contents generate in post action of open tag.
359 result += addedContents;
360 // Save the result to data buffer.
361 saveHTMLContentToBuffer(result, param);
362 }
363
364 // Serialize end tag of an specified element.
endTagToString(Element * element,SerializeDomParam * param)365 void WebPageSerializerImpl::endTagToString(Element* element,
366 SerializeDomParam* param)
367 {
368 bool needSkip;
369 // Do pre action for end tag.
370 String result = preActionBeforeSerializeEndTag(element,
371 param,
372 &needSkip);
373 if (needSkip)
374 return;
375 // Write end tag when element has child/children.
376 if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) {
377 result += "</";
378 result += element->nodeName().lower();
379 result += ">";
380 } else {
381 // Check whether we have to write end tag for empty element.
382 if (param->isHTMLDocument) {
383 result += ">";
384 // FIXME: This code is horribly wrong. WebPageSerializerImpl must die.
385 if (!static_cast<const HTMLElement*>(element)->ieForbidsInsertHTML()) {
386 // We need to write end tag when it is required.
387 result += "</";
388 result += element->nodeName().lower();
389 result += ">";
390 }
391 } else {
392 // For xml base document.
393 result += " />";
394 }
395 }
396 // Do post action for end tag.
397 result += postActionAfterSerializeEndTag(element, param);
398 // Save the result to data buffer.
399 saveHTMLContentToBuffer(result, param);
400 }
401
buildContentForNode(Node * node,SerializeDomParam * param)402 void WebPageSerializerImpl::buildContentForNode(Node* node,
403 SerializeDomParam* param)
404 {
405 switch (node->nodeType()) {
406 case Node::ELEMENT_NODE:
407 // Process open tag of element.
408 openTagToString(static_cast<Element*>(node), param);
409 // Walk through the children nodes and process it.
410 for (Node *child = node->firstChild(); child; child = child->nextSibling())
411 buildContentForNode(child, param);
412 // Process end tag of element.
413 endTagToString(static_cast<Element*>(node), param);
414 break;
415 case Node::TEXT_NODE:
416 saveHTMLContentToBuffer(createMarkup(node), param);
417 break;
418 case Node::ATTRIBUTE_NODE:
419 case Node::DOCUMENT_NODE:
420 case Node::DOCUMENT_FRAGMENT_NODE:
421 // Should not exist.
422 ASSERT_NOT_REACHED();
423 break;
424 // Document type node can be in DOM?
425 case Node::DOCUMENT_TYPE_NODE:
426 param->haveSeenDocType = true;
427 default:
428 // For other type node, call default action.
429 saveHTMLContentToBuffer(createMarkup(node), param);
430 break;
431 }
432 }
433
WebPageSerializerImpl(WebFrame * frame,bool recursiveSerialization,WebPageSerializerClient * client,const WebVector<WebURL> & links,const WebVector<WebString> & localPaths,const WebString & localDirectoryName)434 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
435 bool recursiveSerialization,
436 WebPageSerializerClient* client,
437 const WebVector<WebURL>& links,
438 const WebVector<WebString>& localPaths,
439 const WebString& localDirectoryName)
440 : m_client(client)
441 , m_recursiveSerialization(recursiveSerialization)
442 , m_framesCollected(false)
443 , m_localDirectoryName(localDirectoryName)
444 , m_htmlEntities(false)
445 , m_xmlEntities(true)
446 {
447 // Must specify available webframe.
448 ASSERT(frame);
449 m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
450 // Make sure we have non 0 client.
451 ASSERT(client);
452 // Build local resources map.
453 ASSERT(links.size() == localPaths.size());
454 for (size_t i = 0; i < links.size(); i++) {
455 KURL url = links[i];
456 ASSERT(!m_localLinks.contains(url.string()));
457 m_localLinks.set(url.string(), localPaths[i]);
458 }
459
460 ASSERT(m_dataBuffer.isEmpty());
461 }
462
collectTargetFrames()463 void WebPageSerializerImpl::collectTargetFrames()
464 {
465 ASSERT(!m_framesCollected);
466 m_framesCollected = true;
467
468 // First, process main frame.
469 m_frames.append(m_specifiedWebFrameImpl);
470 // Return now if user only needs to serialize specified frame, not including
471 // all sub-frames.
472 if (!m_recursiveSerialization)
473 return;
474 // Collect all frames inside the specified frame.
475 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
476 WebFrameImpl* currentFrame = m_frames[i];
477 // Get current using document.
478 Document* currentDoc = currentFrame->frame()->document();
479 // Go through sub-frames.
480 RefPtr<HTMLAllCollection> all = currentDoc->all();
481 for (Node* node = all->firstItem(); node; node = all->nextItem()) {
482 if (!node->isHTMLElement())
483 continue;
484 Element* element = static_cast<Element*>(node);
485 WebFrameImpl* webFrame =
486 WebFrameImpl::fromFrameOwnerElement(element);
487 if (webFrame)
488 m_frames.append(webFrame);
489 }
490 }
491 }
492
serialize()493 bool WebPageSerializerImpl::serialize()
494 {
495 if (!m_framesCollected)
496 collectTargetFrames();
497
498 bool didSerialization = false;
499 KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url();
500
501 for (unsigned i = 0; i < m_frames.size(); ++i) {
502 WebFrameImpl* webFrame = m_frames[i];
503 Document* document = webFrame->frame()->document();
504 const KURL& url = document->url();
505
506 if (!url.isValid() || !m_localLinks.contains(url.string()))
507 continue;
508
509 didSerialization = true;
510
511 String encoding = document->loader()->writer()->encoding();
512 const TextEncoding& textEncoding = encoding.isEmpty() ? UTF8Encoding() : TextEncoding(encoding);
513 String directoryName = url == mainURL ? m_localDirectoryName : "";
514
515 SerializeDomParam param(url, textEncoding, document, directoryName);
516
517 Element* documentElement = document->documentElement();
518 if (documentElement)
519 buildContentForNode(documentElement, ¶m);
520
521 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush);
522 }
523
524 ASSERT(m_dataBuffer.isEmpty());
525 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
526 return didSerialization;
527 }
528
529 } // namespace WebKit
530