• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/compiler_specific.h"
6 #include "base/file_path.h"
7 #include "base/file_util.h"
8 #include "base/hash_tables.h"
9 #include "base/string_util.h"
10 #include "base/utf_string_conversions.h"
11 #include "net/base/net_util.h"
12 #include "net/url_request/url_request_context.h"
13 #include "third_party/WebKit/Source/WebKit/chromium/public/WebCString.h"
14 #include "third_party/WebKit/Source/WebKit/chromium/public/WebData.h"
15 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
16 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
17 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h"
19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h"
20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h"
21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializer.h"
22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializerClient.h"
23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h"
24 #include "third_party/WebKit/Source/WebKit/chromium/public/WebURL.h"
25 #include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h"
26 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"
27 #include "webkit/glue/dom_operations.h"
28 #include "webkit/glue/webkit_glue.h"
29 #include "webkit/tools/test_shell/simple_resource_loader_bridge.h"
30 #include "webkit/tools/test_shell/test_shell_test.h"
31 
32 using WebKit::WebCString;
33 using WebKit::WebData;
34 using WebKit::WebDocument;
35 using WebKit::WebElement;
36 using WebKit::WebFrame;
37 using WebKit::WebNode;
38 using WebKit::WebNodeCollection;
39 using WebKit::WebNodeList;
40 using WebKit::WebPageSerializer;
41 using WebKit::WebPageSerializerClient;
42 using WebKit::WebNode;
43 using WebKit::WebString;
44 using WebKit::WebURL;
45 using WebKit::WebView;
46 using WebKit::WebVector;
47 
48 namespace {
49 
50 // Iterate recursively over sub-frames to find one with with a given url.
FindSubFrameByURL(WebView * web_view,const GURL & url)51 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
52   if (!web_view->mainFrame())
53     return NULL;
54 
55   std::vector<WebFrame*> stack;
56   stack.push_back(web_view->mainFrame());
57 
58   while (!stack.empty()) {
59     WebFrame* current_frame = stack.back();
60     stack.pop_back();
61     if (GURL(current_frame->url()) == url)
62       return current_frame;
63     WebNodeCollection all = current_frame->document().all();
64     for (WebNode node = all.firstItem();
65          !node.isNull(); node = all.nextItem()) {
66       if (!node.isElementNode())
67         continue;
68       // Check frame tag and iframe tag
69       WebElement element = node.to<WebElement>();
70       if (!element.hasTagName("frame") && !element.hasTagName("iframe"))
71         continue;
72       WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
73       if (sub_frame)
74         stack.push_back(sub_frame);
75     }
76   }
77   return NULL;
78 }
79 
80 class DomSerializerTests : public TestShellTest,
81                            public WebPageSerializerClient {
82  public:
DomSerializerTests()83   DomSerializerTests()
84     : local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) { }
85 
86   // DomSerializerDelegate.
didSerializeDataForFrame(const WebURL & frame_web_url,const WebCString & data,PageSerializationStatus status)87   void didSerializeDataForFrame(const WebURL& frame_web_url,
88                                 const WebCString& data,
89                                 PageSerializationStatus status) {
90 
91     GURL frame_url(frame_web_url);
92     // If the all frames are finished saving, check all finish status
93     if (status == WebPageSerializerClient::AllFramesAreFinished) {
94       SerializationFinishStatusMap::iterator it =
95           serialization_finish_status_.begin();
96       for (; it != serialization_finish_status_.end(); ++it)
97         ASSERT_TRUE(it->second);
98       serialized_ = true;
99       return;
100     }
101 
102     // Check finish status of current frame.
103     SerializationFinishStatusMap::iterator it =
104         serialization_finish_status_.find(frame_url.spec());
105     // New frame, set initial status as false.
106     if (it == serialization_finish_status_.end())
107       serialization_finish_status_[frame_url.spec()] = false;
108 
109     it = serialization_finish_status_.find(frame_url.spec());
110     ASSERT_TRUE(it != serialization_finish_status_.end());
111     // In process frame, finish status should be false.
112     ASSERT_FALSE(it->second);
113 
114     // Add data to corresponding frame's content.
115     serialized_frame_map_[frame_url.spec()] += data.data();
116 
117     // Current frame is completed saving, change the finish status.
118     if (status == WebPageSerializerClient::CurrentFrameIsFinished)
119       it->second = true;
120   }
121 
HasSerializedFrame(const GURL & frame_url)122   bool HasSerializedFrame(const GURL& frame_url) {
123     return serialized_frame_map_.find(frame_url.spec()) !=
124            serialized_frame_map_.end();
125   }
126 
GetSerializedContentForFrame(const GURL & frame_url)127   const std::string& GetSerializedContentForFrame(
128       const GURL& frame_url) {
129     return serialized_frame_map_[frame_url.spec()];
130   }
131 
132   // Load web page according to specific URL.
LoadPageFromURL(const GURL & page_url)133   void LoadPageFromURL(const GURL& page_url) {
134     // Load the test file.
135     test_shell_->ResetTestController();
136     test_shell_->LoadURL(page_url);
137     test_shell_->WaitTestFinished();
138   }
139 
140   // Load web page according to input content and relative URLs within
141   // the document.
LoadContents(const std::string & contents,const GURL & base_url,const WebString encoding_info)142   void LoadContents(const std::string& contents,
143                     const GURL& base_url,
144                     const WebString encoding_info) {
145     test_shell_->ResetTestController();
146     // If input encoding is empty, use UTF-8 as default encoding.
147     if (encoding_info.isEmpty()) {
148       test_shell_->webView()->mainFrame()->loadHTMLString(contents, base_url);
149     } else {
150       WebData data(contents.data(), contents.length());
151 
152       // Do not use WebFrame.LoadHTMLString because it assumes that input
153       // html contents use UTF-8 encoding.
154       // TODO(darin): This should use WebFrame::loadData.
155       WebFrame* web_frame =
156           test_shell_->webView()->mainFrame();
157 
158       ASSERT_TRUE(web_frame != NULL);
159 
160       web_frame->loadData(data, "text/html", encoding_info, base_url);
161     }
162 
163     test_shell_->WaitTestFinished();
164   }
165 
166   // Serialize page DOM according to specific page URL. The parameter
167   // recursive_serialization indicates whether we will serialize all
168   // sub-frames.
SerializeDomForURL(const GURL & page_url,bool recursive_serialization)169   void SerializeDomForURL(const GURL& page_url,
170                           bool recursive_serialization) {
171     // Find corresponding WebFrame according to page_url.
172     WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(),
173                                             page_url);
174     ASSERT_TRUE(web_frame != NULL);
175     // Add input file URl to links_.
176     links_.assign(&page_url,1);
177     // Add dummy file path to local_path_.
178     WebString file_path = webkit_glue::FilePathStringToWebString(
179         FILE_PATH_LITERAL("c:\\dummy.htm"));
180     local_paths_.assign(&file_path, 1);
181     // Start serializing DOM.
182     bool result = WebPageSerializer::serialize(web_frame,
183        recursive_serialization,
184        static_cast<WebPageSerializerClient*>(this),
185        links_,
186        local_paths_,
187        webkit_glue::FilePathToWebString(local_directory_name_));
188     ASSERT_TRUE(result);
189     ASSERT_TRUE(serialized_);
190   }
191 
192  private:
193   // Map frame_url to corresponding serialized_content.
194   typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
195   SerializedFrameContentMap serialized_frame_map_;
196   // Map frame_url to corresponding status of serialization finish.
197   typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
198   SerializationFinishStatusMap serialization_finish_status_;
199   // Flag indicates whether the process of serializing DOM is finished or not.
200   bool serialized_;
201   // The links_ contain dummy original URLs of all saved links.
202   WebVector<WebURL> links_;
203   // The local_paths_ contain dummy corresponding local file paths of all saved
204   // links, which matched links_ one by one.
205   WebVector<WebString> local_paths_;
206   // The local_directory_name_ is dummy relative path of directory which
207   // contain all saved auxiliary files included all sub frames and resources.
208   const FilePath local_directory_name_;
209 
210  protected:
211   // testing::Test
SetUp()212   virtual void SetUp() {
213     TestShellTest::SetUp();
214     serialized_ = false;
215   }
216 
TearDown()217   virtual void TearDown() {
218     TestShellTest::TearDown();
219   }
220 };
221 
222 // Helper function that test whether the first node in the doc is a doc type
223 // node.
HasDocType(const WebDocument & doc)224 bool HasDocType(const WebDocument& doc) {
225   WebNode node = doc.firstChild();
226   if (node.isNull())
227     return false;
228   return node.nodeType() == WebNode::DocumentTypeNode;
229 }
230 
231 // Helper function for checking whether input node is META tag. Return true
232 // means it is META element, otherwise return false. The parameter charset_info
233 // return actual charset info if the META tag has charset declaration.
IsMetaElement(const WebNode & node,std::string & charset_info)234 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
235   if (!node.isElementNode())
236     return false;
237   const WebElement meta = node.toConst<WebElement>();
238   if (!meta.hasTagName("meta"))
239     return false;
240   charset_info.erase(0, charset_info.length());
241   // Check the META charset declaration.
242   WebString httpEquiv = meta.getAttribute("http-equiv");
243   if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
244     std::string content = meta.getAttribute("content").utf8();
245     int pos = content.find("charset", 0);
246     if (pos > -1) {
247       // Add a dummy charset declaration to charset_info, which indicates this
248       // META tag has charset declaration although we do not get correct value
249       // yet.
250       charset_info.append("has-charset-declaration");
251       int remaining_length = content.length() - pos - 7;
252       if (!remaining_length)
253         return true;
254       int start_pos = pos + 7;
255       // Find "=" symbol.
256       while (remaining_length--)
257         if (content[start_pos++] == L'=')
258           break;
259       // Skip beginning space.
260       while (remaining_length) {
261         if (content[start_pos] > 0x0020)
262           break;
263         ++start_pos;
264         --remaining_length;
265       }
266       if (!remaining_length)
267         return true;
268       int end_pos = start_pos;
269       // Now we find out the start point of charset info. Search the end point.
270       while (remaining_length--) {
271         if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
272           break;
273         ++end_pos;
274       }
275       // Get actual charset info.
276       charset_info = content.substr(start_pos, end_pos - start_pos);
277       return true;
278     }
279   }
280   return true;
281 }
282 
283 // If original contents have document type, the serialized contents also have
284 // document type.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithDocType)285 TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
286   FilePath page_file_path = data_dir_;
287   page_file_path = page_file_path.AppendASCII("dom_serializer");
288   page_file_path = page_file_path.AppendASCII("youtube_1.htm");
289   GURL file_url = net::FilePathToFileURL(page_file_path);
290   ASSERT_TRUE(file_url.SchemeIsFile());
291   // Load the test file.
292   LoadPageFromURL(file_url);
293   // Make sure original contents have document type.
294   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
295   ASSERT_TRUE(web_frame != NULL);
296   WebDocument doc = web_frame->document();
297   ASSERT_TRUE(HasDocType(doc));
298   // Do serialization.
299   SerializeDomForURL(file_url, false);
300   // Load the serialized contents.
301   ASSERT_TRUE(HasSerializedFrame(file_url));
302   const std::string& serialized_contents =
303       GetSerializedContentForFrame(file_url);
304   LoadContents(serialized_contents, file_url,
305                web_frame->encoding());
306   // Make sure serialized contents still have document type.
307   web_frame = test_shell_->webView()->mainFrame();
308   doc = web_frame->document();
309   ASSERT_TRUE(HasDocType(doc));
310 }
311 
312 // If original contents do not have document type, the serialized contents
313 // also do not have document type.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithoutDocType)314 TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
315   FilePath page_file_path = data_dir_;
316   page_file_path = page_file_path.AppendASCII("dom_serializer");
317   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
318   GURL file_url = net::FilePathToFileURL(page_file_path);
319   ASSERT_TRUE(file_url.SchemeIsFile());
320   // Load the test file.
321   LoadPageFromURL(file_url);
322   // Make sure original contents do not have document type.
323   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
324   ASSERT_TRUE(web_frame != NULL);
325   WebDocument doc = web_frame->document();
326   ASSERT_TRUE(!HasDocType(doc));
327   // Do serialization.
328   SerializeDomForURL(file_url, false);
329   // Load the serialized contents.
330   ASSERT_TRUE(HasSerializedFrame(file_url));
331   const std::string& serialized_contents =
332       GetSerializedContentForFrame(file_url);
333   LoadContents(serialized_contents, file_url,
334                web_frame->encoding());
335   // Make sure serialized contents do not have document type.
336   web_frame = test_shell_->webView()->mainFrame();
337   doc = web_frame->document();
338   ASSERT_TRUE(!HasDocType(doc));
339 }
340 
341 // Serialize XML document which has all 5 built-in entities. After
342 // finishing serialization, the serialized contents should be same
343 // with original XML document.
TEST_F(DomSerializerTests,SerializeXMLDocWithBuiltInEntities)344 TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) {
345   FilePath page_file_path = data_dir_;
346   page_file_path = page_file_path.AppendASCII("dom_serializer");
347   page_file_path = page_file_path.AppendASCII("note.xml");
348   // Read original contents for later comparison.
349   std::string original_contents;
350   ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
351   // Get file URL.
352   GURL file_url = net::FilePathToFileURL(page_file_path);
353   ASSERT_TRUE(file_url.SchemeIsFile());
354   // Load the test file.
355   LoadPageFromURL(file_url);
356   // Do serialization.
357   SerializeDomForURL(file_url, false);
358   // Compare the serialized contents with original contents.
359   ASSERT_TRUE(HasSerializedFrame(file_url));
360   const std::string& serialized_contents =
361       GetSerializedContentForFrame(file_url);
362   ASSERT_EQ(original_contents, serialized_contents);
363 }
364 
365 // When serializing DOM, we add MOTW declaration before html tag.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithAddingMOTW)366 TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
367   FilePath page_file_path = data_dir_;
368   page_file_path = page_file_path.AppendASCII("dom_serializer");
369   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
370   // Read original contents for later comparison .
371   std::string original_contents;
372   ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
373   // Get file URL.
374   GURL file_url = net::FilePathToFileURL(page_file_path);
375   ASSERT_TRUE(file_url.SchemeIsFile());
376   // Make sure original contents does not have MOTW;
377   std::string motw_declaration =
378      WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
379   ASSERT_FALSE(motw_declaration.empty());
380   // The encoding of original contents is ISO-8859-1, so we convert the MOTW
381   // declaration to ASCII and search whether original contents has it or not.
382   ASSERT_TRUE(std::string::npos ==
383       original_contents.find(motw_declaration));
384   // Load the test file.
385   LoadPageFromURL(file_url);
386   // Do serialization.
387   SerializeDomForURL(file_url, false);
388   // Make sure the serialized contents have MOTW ;
389   ASSERT_TRUE(HasSerializedFrame(file_url));
390   const std::string& serialized_contents =
391       GetSerializedContentForFrame(file_url);
392   ASSERT_FALSE(std::string::npos ==
393       serialized_contents.find(motw_declaration));
394 }
395 
396 // When serializing DOM, we will add the META which have correct charset
397 // declaration as first child of HEAD element for resolving WebKit bug:
398 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
399 // does not have META charset declaration.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc)400 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
401   FilePath page_file_path = data_dir_;
402   page_file_path = page_file_path.AppendASCII("dom_serializer");
403   page_file_path = page_file_path.AppendASCII("youtube_1.htm");
404   // Get file URL.
405   GURL file_url = net::FilePathToFileURL(page_file_path);
406   ASSERT_TRUE(file_url.SchemeIsFile());
407   // Load the test file.
408   LoadPageFromURL(file_url);
409 
410   // Make sure there is no META charset declaration in original document.
411   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
412   ASSERT_TRUE(web_frame != NULL);
413   WebDocument doc = web_frame->document();
414   ASSERT_TRUE(doc.isHTMLDocument());
415   WebElement head_element = doc.head();
416   ASSERT_TRUE(!head_element.isNull());
417   // Go through all children of HEAD element.
418   for (WebNode child = head_element.firstChild(); !child.isNull();
419        child = child.nextSibling()) {
420     std::string charset_info;
421     if (IsMetaElement(child, charset_info))
422       ASSERT_TRUE(charset_info.empty());
423   }
424   // Do serialization.
425   SerializeDomForURL(file_url, false);
426 
427   // Load the serialized contents.
428   ASSERT_TRUE(HasSerializedFrame(file_url));
429   const std::string& serialized_contents =
430       GetSerializedContentForFrame(file_url);
431   LoadContents(serialized_contents, file_url,
432                web_frame->encoding());
433   // Make sure the first child of HEAD element is META which has charset
434   // declaration in serialized contents.
435   web_frame = test_shell_->webView()->mainFrame();
436   ASSERT_TRUE(web_frame != NULL);
437   doc = web_frame->document();
438   ASSERT_TRUE(doc.isHTMLDocument());
439   head_element = doc.head();
440   ASSERT_TRUE(!head_element.isNull());
441   WebNode meta_node = head_element.firstChild();
442   ASSERT_TRUE(!meta_node.isNull());
443   // Get meta charset info.
444   std::string charset_info2;
445   ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
446   ASSERT_TRUE(!charset_info2.empty());
447   ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
448 
449   // Make sure no more additional META tags which have charset declaration.
450   for (WebNode child = meta_node.nextSibling(); !child.isNull();
451        child = child.nextSibling()) {
452     std::string charset_info;
453     if (IsMetaElement(child, charset_info))
454       ASSERT_TRUE(charset_info.empty());
455   }
456 }
457 
458 // When serializing DOM, if the original document has multiple META charset
459 // declaration, we will add the META which have correct charset declaration
460 // as first child of HEAD element and remove all original META charset
461 // declarations.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc)462 TEST_F(DomSerializerTests,
463        SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
464   FilePath page_file_path = data_dir_;
465   page_file_path = page_file_path.AppendASCII("dom_serializer");
466   page_file_path = page_file_path.AppendASCII("youtube_2.htm");
467   // Get file URL.
468   GURL file_url = net::FilePathToFileURL(page_file_path);
469   ASSERT_TRUE(file_url.SchemeIsFile());
470   // Load the test file.
471   LoadPageFromURL(file_url);
472 
473   // Make sure there are multiple META charset declarations in original
474   // document.
475   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
476   ASSERT_TRUE(web_frame != NULL);
477   WebDocument doc = web_frame->document();
478   ASSERT_TRUE(doc.isHTMLDocument());
479   WebElement head_ele = doc.head();
480   ASSERT_TRUE(!head_ele.isNull());
481   // Go through all children of HEAD element.
482   int charset_declaration_count = 0;
483   for (WebNode child = head_ele.firstChild(); !child.isNull();
484        child = child.nextSibling()) {
485     std::string charset_info;
486     if (IsMetaElement(child, charset_info) && !charset_info.empty())
487       charset_declaration_count++;
488   }
489   // The original doc has more than META tags which have charset declaration.
490   ASSERT_TRUE(charset_declaration_count > 1);
491 
492   // Do serialization.
493   SerializeDomForURL(file_url, false);
494 
495   // Load the serialized contents.
496   ASSERT_TRUE(HasSerializedFrame(file_url));
497   const std::string& serialized_contents =
498       GetSerializedContentForFrame(file_url);
499   LoadContents(serialized_contents, file_url,
500                web_frame->encoding());
501   // Make sure only first child of HEAD element is META which has charset
502   // declaration in serialized contents.
503   web_frame = test_shell_->webView()->mainFrame();
504   ASSERT_TRUE(web_frame != NULL);
505   doc = web_frame->document();
506   ASSERT_TRUE(doc.isHTMLDocument());
507   head_ele = doc.head();
508   ASSERT_TRUE(!head_ele.isNull());
509   WebNode meta_node = head_ele.firstChild();
510   ASSERT_TRUE(!meta_node.isNull());
511   // Get meta charset info.
512   std::string charset_info2;
513   ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
514   ASSERT_TRUE(!charset_info2.empty());
515   ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
516 
517   // Make sure no more additional META tags which have charset declaration.
518   for (WebNode child = meta_node.nextSibling(); !child.isNull();
519        child = child.nextSibling()) {
520     std::string charset_info;
521     if (IsMetaElement(child, charset_info))
522       ASSERT_TRUE(charset_info.empty());
523   }
524 }
525 
526 // Test situation of html entities in text when serializing HTML DOM.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithEntitiesInText)527 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
528   FilePath page_file_path = data_dir_;
529   page_file_path = page_file_path.AppendASCII(
530       "dom_serializer/htmlentities_in_text.htm");
531   // Get file URL. The URL is dummy URL to identify the following loading
532   // actions. The test content is in constant:original_contents.
533   GURL file_url = net::FilePathToFileURL(page_file_path);
534   ASSERT_TRUE(file_url.SchemeIsFile());
535   // Test contents.
536   static const char* const original_contents =
537       "<html><body>&amp;&lt;&gt;\"\'</body></html>";
538   // Load the test contents.
539   LoadContents(original_contents, file_url, WebString());
540 
541   // Get BODY's text content in DOM.
542   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
543   ASSERT_TRUE(web_frame != NULL);
544   WebDocument doc = web_frame->document();
545   ASSERT_TRUE(doc.isHTMLDocument());
546   WebElement body_ele = doc.body();
547   ASSERT_TRUE(!body_ele.isNull());
548   WebNode text_node = body_ele.firstChild();
549   ASSERT_TRUE(text_node.isTextNode());
550   ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
551               "&amp;&lt;&gt;\"\'");
552   // Do serialization.
553   SerializeDomForURL(file_url, false);
554   // Compare the serialized contents with original contents.
555   ASSERT_TRUE(HasSerializedFrame(file_url));
556   const std::string& serialized_contents =
557       GetSerializedContentForFrame(file_url);
558   // Compare the serialized contents with original contents to make sure
559   // they are same.
560   // Because we add MOTW when serializing DOM, so before comparison, we also
561   // need to add MOTW to original_contents.
562   std::string original_str =
563     WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
564   original_str += original_contents;
565   // Since WebCore now inserts a new HEAD element if there is no HEAD element
566   // when creating BODY element. (Please see HTMLParser::bodyCreateErrorCheck.)
567   // We need to append the HEAD content and corresponding META content if we
568   // find WebCore-generated HEAD element.
569   if (!doc.head().isNull()) {
570     WebString encoding = web_frame->encoding();
571     std::string htmlTag("<html>");
572     std::string::size_type pos = original_str.find(htmlTag);
573     ASSERT_NE(std::string::npos, pos);
574     pos += htmlTag.length();
575     std::string head_part("<head>");
576     head_part +=
577         WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
578     head_part += "</head>";
579     original_str.insert(pos, head_part);
580   }
581   ASSERT_EQ(original_str, serialized_contents);
582 }
583 
584 // Test situation of html entities in attribute value when serializing
585 // HTML DOM.
586 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithEntitiesInAttributeValue)587 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInAttributeValue) {
588   FilePath page_file_path = data_dir_;
589   page_file_path = page_file_path.AppendASCII(
590       "dom_serializer/htmlentities_in_attribute_value.htm");
591   // Get file URL. The URL is dummy URL to identify the following loading
592   // actions. The test content is in constant:original_contents.
593   GURL file_url = net::FilePathToFileURL(page_file_path);
594   ASSERT_TRUE(file_url.SchemeIsFile());
595   // Test contents.
596   static const char* const original_contents =
597       "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
598   // Load the test contents.
599   LoadContents(original_contents, file_url, WebString());
600   // Get value of BODY's title attribute in DOM.
601   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
602   ASSERT_TRUE(web_frame != NULL);
603   WebDocument doc = web_frame->document();
604   ASSERT_TRUE(doc.isHTMLDocument());
605   WebElement body_ele = doc.body();
606   ASSERT_TRUE(!body_ele.isNull());
607   WebString value = body_ele.getAttribute("title");
608   ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
609   // Do serialization.
610   SerializeDomForURL(file_url, false);
611   // Compare the serialized contents with original contents.
612   ASSERT_TRUE(HasSerializedFrame(file_url));
613   const std::string& serialized_contents =
614       GetSerializedContentForFrame(file_url);
615   // Compare the serialized contents with original contents to make sure
616   // they are same.
617   std::string original_str =
618       WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
619   original_str += original_contents;
620   if (!doc.isNull()) {
621     WebString encoding = web_frame->encoding();
622     std::string htmlTag("<html>");
623     std::string::size_type pos = original_str.find(htmlTag);
624     ASSERT_NE(std::string::npos, pos);
625     pos += htmlTag.length();
626     std::string head_part("<head>");
627     head_part +=
628         WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
629     head_part += "</head>";
630     original_str.insert(pos, head_part);
631   }
632   ASSERT_EQ(original_str, serialized_contents);
633 }
634 
635 // Test situation of non-standard HTML entities when serializing HTML DOM.
636 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithNonStandardEntities)637 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNonStandardEntities) {
638   // Make a test file URL and load it.
639   FilePath page_file_path = data_dir_;
640   page_file_path = page_file_path.AppendASCII("dom_serializer");
641   page_file_path = page_file_path.AppendASCII("nonstandard_htmlentities.htm");
642   GURL file_url = net::FilePathToFileURL(page_file_path);
643   LoadPageFromURL(file_url);
644 
645   // Get value of BODY's title attribute in DOM.
646   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
647   WebDocument doc = web_frame->document();
648   ASSERT_TRUE(doc.isHTMLDocument());
649   WebElement body_element = doc.body();
650   // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
651   static const wchar_t parsed_value[] = {
652     '%', 0x2285, 0x00b9, '\'', 0
653   };
654   WebString value = body_element.getAttribute("title");
655   ASSERT_TRUE(UTF16ToWide(value) == parsed_value);
656   ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value);
657 
658   // Do serialization.
659   SerializeDomForURL(file_url, false);
660   // Check the serialized string.
661   ASSERT_TRUE(HasSerializedFrame(file_url));
662   const std::string& serialized_contents =
663       GetSerializedContentForFrame(file_url);
664   // Confirm that the serialized string has no non-standard HTML entities.
665   ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
666   ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
667   ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
668   ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
669 }
670 
// Test situation of BASE tag in original document when serializing HTML DOM.
// When serializing, we should comment out the BASE tag, append a new BASE tag,
// rewrite all the savable URLs to relative local paths, and change other URLs
// to absolute URLs.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithBaseTag)675 TEST_F(DomSerializerTests, SerializeHTMLDOMWithBaseTag) {
676   // There are total 2 available base tags in this test file.
677   const int kTotalBaseTagCountInTestFile = 2;
678 
679   FilePath page_file_path = data_dir_.AppendASCII("dom_serializer");
680   file_util::EnsureEndsWithSeparator(&page_file_path);
681 
682   // Get page dir URL which is base URL of this file.
683   GURL path_dir_url = net::FilePathToFileURL(page_file_path);
684   // Get file path.
685   page_file_path =
686       page_file_path.AppendASCII("html_doc_has_base_tag.htm");
687   // Get file URL.
688   GURL file_url = net::FilePathToFileURL(page_file_path);
689   ASSERT_TRUE(file_url.SchemeIsFile());
690   // Load the test file.
691   LoadPageFromURL(file_url);
692   // Since for this test, we assume there is no savable sub-resource links for
693   // this test file, also all links are relative URLs in this test file, so we
694   // need to check those relative URLs and make sure document has BASE tag.
695   WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
696   ASSERT_TRUE(web_frame != NULL);
697   WebDocument doc = web_frame->document();
698   ASSERT_TRUE(doc.isHTMLDocument());
699   // Go through all descent nodes.
700   WebNodeCollection all = doc.all();
701   int original_base_tag_count = 0;
702   for (WebNode node = all.firstItem(); !node.isNull();
703        node = all.nextItem()) {
704     if (!node.isElementNode())
705       continue;
706     WebElement element = node.to<WebElement>();
707     if (element.hasTagName("base")) {
708       original_base_tag_count++;
709     } else {
710       // Get link.
711       WebString value =
712           webkit_glue::GetSubResourceLinkFromElement(element);
713       if (value.isNull() && element.hasTagName("a")) {
714         value = element.getAttribute("href");
715         if (value.isEmpty())
716           value = WebString();
717       }
718       // Each link is relative link.
719       if (!value.isNull()) {
720         GURL link(value.utf8());
721         ASSERT_TRUE(link.scheme().empty());
722       }
723     }
724   }
725   ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
726   // Make sure in original document, the base URL is not equal with the
727   // |path_dir_url|.
728   GURL original_base_url(doc.baseURL());
729   ASSERT_NE(original_base_url, path_dir_url);
730 
731   // Do serialization.
732   SerializeDomForURL(file_url, false);
733 
734   // Load the serialized contents.
735   ASSERT_TRUE(HasSerializedFrame(file_url));
736   const std::string& serialized_contents =
737       GetSerializedContentForFrame(file_url);
738   LoadContents(serialized_contents, file_url,
739                web_frame->encoding());
740 
741   // Make sure all links are absolute URLs and doc there are some number of
742   // BASE tags in serialized HTML data. Each of those BASE tags have same base
743   // URL which is as same as URL of current test file.
744   web_frame = test_shell_->webView()->mainFrame();
745   ASSERT_TRUE(web_frame != NULL);
746   doc = web_frame->document();
747   ASSERT_TRUE(doc.isHTMLDocument());
748   // Go through all descent nodes.
749   all = doc.all();
750   int new_base_tag_count = 0;
751   for (WebNode node = all.firstItem(); !node.isNull();
752        node = all.nextItem()) {
753     if (!node.isElementNode())
754       continue;
755     WebElement element = node.to<WebElement>();
756     if (element.hasTagName("base")) {
757       new_base_tag_count++;
758     } else {
759       // Get link.
760       WebString value =
761           webkit_glue::GetSubResourceLinkFromElement(element);
762       if (value.isNull() && element.hasTagName("a")) {
763         value = element.getAttribute("href");
764         if (value.isEmpty())
765           value = WebString();
766       }
767       // Each link is absolute link.
768       if (!value.isNull()) {
769         GURL link(std::string(value.utf8()));
770         ASSERT_FALSE(link.scheme().empty());
771       }
772     }
773   }
774   // We have one more added BASE tag which is generated by JavaScript.
775   ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
776   // Make sure in new document, the base URL is equal with the |path_dir_url|.
777   GURL new_base_url(doc.baseURL());
778   ASSERT_EQ(new_base_url, path_dir_url);
779 }
780 
781 // Serializing page which has an empty HEAD tag.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithEmptyHead)782 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
783   FilePath page_file_path = data_dir_;
784   page_file_path = page_file_path.AppendASCII("dom_serializer");
785   page_file_path = page_file_path.AppendASCII("empty_head.htm");
786   GURL file_url = net::FilePathToFileURL(page_file_path);
787   ASSERT_TRUE(file_url.SchemeIsFile());
788 
789   // Load the test html content.
790   static const char* const empty_head_contents =
791     "<html><head></head><body>hello world</body></html>";
792   LoadContents(empty_head_contents, file_url, WebString());
793 
794   // Make sure the head tag is empty.
795   WebFrame* web_frame = test_shell_->webView()->mainFrame();
796   ASSERT_TRUE(web_frame != NULL);
797   WebDocument doc = web_frame->document();
798   ASSERT_TRUE(doc.isHTMLDocument());
799   WebElement head_element = doc.head();
800   ASSERT_TRUE(!head_element.isNull());
801   ASSERT_TRUE(!head_element.hasChildNodes());
802   ASSERT_TRUE(head_element.childNodes().length() == 0);
803 
804   // Do serialization.
805   SerializeDomForURL(file_url, false);
806   // Make sure the serialized contents have META ;
807   ASSERT_TRUE(HasSerializedFrame(file_url));
808   const std::string& serialized_contents =
809       GetSerializedContentForFrame(file_url);
810 
811   // Reload serialized contents and make sure there is only one META tag.
812   LoadContents(serialized_contents, file_url, web_frame->encoding());
813   web_frame = test_shell_->webView()->mainFrame();
814   ASSERT_TRUE(web_frame != NULL);
815   doc = web_frame->document();
816   ASSERT_TRUE(doc.isHTMLDocument());
817   head_element = doc.head();
818   ASSERT_TRUE(!head_element.isNull());
819   ASSERT_TRUE(head_element.hasChildNodes());
820   ASSERT_TRUE(head_element.childNodes().length() == 1);
821   WebNode meta_node = head_element.firstChild();
822   ASSERT_TRUE(!meta_node.isNull());
823   // Get meta charset info.
824   std::string charset_info;
825   ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
826   ASSERT_TRUE(!charset_info.empty());
827   ASSERT_TRUE(charset_info == std::string(web_frame->encoding().utf8()));
828 
829   // Check the body's first node is text node and its contents are
830   // "hello world"
831   WebElement body_element = doc.body();
832   ASSERT_TRUE(!body_element.isNull());
833   WebNode text_node = body_element.firstChild();
834   ASSERT_TRUE(text_node.isTextNode());
835   WebString text_node_contents = text_node.nodeValue();
836   ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
837 }
838 
839 // Test that we don't crash when the page contains an iframe that
840 // was handled as a download (http://crbug.com/42212).
TEST_F(DomSerializerTests,SerializeDocumentWithDownloadedIFrame)841 TEST_F(DomSerializerTests, SerializeDocumentWithDownloadedIFrame) {
842   FilePath page_file_path = data_dir_;
843   page_file_path = page_file_path.AppendASCII("dom_serializer");
844   page_file_path = page_file_path.AppendASCII("iframe-src-is-exe.htm");
845   GURL file_url = net::FilePathToFileURL(page_file_path);
846   ASSERT_TRUE(file_url.SchemeIsFile());
847   // Load the test file.
848   LoadPageFromURL(file_url);
849   // Do a recursive serialization. We pass if we don't crash.
850   SerializeDomForURL(file_url, true);
851 }
852 
853 }  // namespace
854