1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/compiler_specific.h"
6 #include "base/file_path.h"
7 #include "base/file_util.h"
8 #include "base/hash_tables.h"
9 #include "base/string_util.h"
10 #include "base/utf_string_conversions.h"
11 #include "net/base/net_util.h"
12 #include "net/url_request/url_request_context.h"
13 #include "third_party/WebKit/Source/WebKit/chromium/public/WebCString.h"
14 #include "third_party/WebKit/Source/WebKit/chromium/public/WebData.h"
15 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
16 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
17 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNode.h"
19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h"
20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeList.h"
21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializer.h"
22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebPageSerializerClient.h"
23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h"
24 #include "third_party/WebKit/Source/WebKit/chromium/public/WebURL.h"
25 #include "third_party/WebKit/Source/WebKit/chromium/public/WebVector.h"
26 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"
27 #include "webkit/glue/dom_operations.h"
28 #include "webkit/glue/webkit_glue.h"
29 #include "webkit/tools/test_shell/simple_resource_loader_bridge.h"
30 #include "webkit/tools/test_shell/test_shell_test.h"
31
32 using WebKit::WebCString;
33 using WebKit::WebData;
34 using WebKit::WebDocument;
35 using WebKit::WebElement;
36 using WebKit::WebFrame;
37 using WebKit::WebNode;
38 using WebKit::WebNodeCollection;
39 using WebKit::WebNodeList;
40 using WebKit::WebPageSerializer;
41 using WebKit::WebPageSerializerClient;
42 using WebKit::WebNode;
43 using WebKit::WebString;
44 using WebKit::WebURL;
45 using WebKit::WebView;
46 using WebKit::WebVector;
47
48 namespace {
49
50 // Iterate recursively over sub-frames to find one with with a given url.
FindSubFrameByURL(WebView * web_view,const GURL & url)51 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
52 if (!web_view->mainFrame())
53 return NULL;
54
55 std::vector<WebFrame*> stack;
56 stack.push_back(web_view->mainFrame());
57
58 while (!stack.empty()) {
59 WebFrame* current_frame = stack.back();
60 stack.pop_back();
61 if (GURL(current_frame->url()) == url)
62 return current_frame;
63 WebNodeCollection all = current_frame->document().all();
64 for (WebNode node = all.firstItem();
65 !node.isNull(); node = all.nextItem()) {
66 if (!node.isElementNode())
67 continue;
68 // Check frame tag and iframe tag
69 WebElement element = node.to<WebElement>();
70 if (!element.hasTagName("frame") && !element.hasTagName("iframe"))
71 continue;
72 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
73 if (sub_frame)
74 stack.push_back(sub_frame);
75 }
76 }
77 return NULL;
78 }
79
80 class DomSerializerTests : public TestShellTest,
81 public WebPageSerializerClient {
82 public:
DomSerializerTests()83 DomSerializerTests()
84 : local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) { }
85
86 // DomSerializerDelegate.
didSerializeDataForFrame(const WebURL & frame_web_url,const WebCString & data,PageSerializationStatus status)87 void didSerializeDataForFrame(const WebURL& frame_web_url,
88 const WebCString& data,
89 PageSerializationStatus status) {
90
91 GURL frame_url(frame_web_url);
92 // If the all frames are finished saving, check all finish status
93 if (status == WebPageSerializerClient::AllFramesAreFinished) {
94 SerializationFinishStatusMap::iterator it =
95 serialization_finish_status_.begin();
96 for (; it != serialization_finish_status_.end(); ++it)
97 ASSERT_TRUE(it->second);
98 serialized_ = true;
99 return;
100 }
101
102 // Check finish status of current frame.
103 SerializationFinishStatusMap::iterator it =
104 serialization_finish_status_.find(frame_url.spec());
105 // New frame, set initial status as false.
106 if (it == serialization_finish_status_.end())
107 serialization_finish_status_[frame_url.spec()] = false;
108
109 it = serialization_finish_status_.find(frame_url.spec());
110 ASSERT_TRUE(it != serialization_finish_status_.end());
111 // In process frame, finish status should be false.
112 ASSERT_FALSE(it->second);
113
114 // Add data to corresponding frame's content.
115 serialized_frame_map_[frame_url.spec()] += data.data();
116
117 // Current frame is completed saving, change the finish status.
118 if (status == WebPageSerializerClient::CurrentFrameIsFinished)
119 it->second = true;
120 }
121
HasSerializedFrame(const GURL & frame_url)122 bool HasSerializedFrame(const GURL& frame_url) {
123 return serialized_frame_map_.find(frame_url.spec()) !=
124 serialized_frame_map_.end();
125 }
126
GetSerializedContentForFrame(const GURL & frame_url)127 const std::string& GetSerializedContentForFrame(
128 const GURL& frame_url) {
129 return serialized_frame_map_[frame_url.spec()];
130 }
131
132 // Load web page according to specific URL.
LoadPageFromURL(const GURL & page_url)133 void LoadPageFromURL(const GURL& page_url) {
134 // Load the test file.
135 test_shell_->ResetTestController();
136 test_shell_->LoadURL(page_url);
137 test_shell_->WaitTestFinished();
138 }
139
140 // Load web page according to input content and relative URLs within
141 // the document.
LoadContents(const std::string & contents,const GURL & base_url,const WebString encoding_info)142 void LoadContents(const std::string& contents,
143 const GURL& base_url,
144 const WebString encoding_info) {
145 test_shell_->ResetTestController();
146 // If input encoding is empty, use UTF-8 as default encoding.
147 if (encoding_info.isEmpty()) {
148 test_shell_->webView()->mainFrame()->loadHTMLString(contents, base_url);
149 } else {
150 WebData data(contents.data(), contents.length());
151
152 // Do not use WebFrame.LoadHTMLString because it assumes that input
153 // html contents use UTF-8 encoding.
154 // TODO(darin): This should use WebFrame::loadData.
155 WebFrame* web_frame =
156 test_shell_->webView()->mainFrame();
157
158 ASSERT_TRUE(web_frame != NULL);
159
160 web_frame->loadData(data, "text/html", encoding_info, base_url);
161 }
162
163 test_shell_->WaitTestFinished();
164 }
165
166 // Serialize page DOM according to specific page URL. The parameter
167 // recursive_serialization indicates whether we will serialize all
168 // sub-frames.
SerializeDomForURL(const GURL & page_url,bool recursive_serialization)169 void SerializeDomForURL(const GURL& page_url,
170 bool recursive_serialization) {
171 // Find corresponding WebFrame according to page_url.
172 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(),
173 page_url);
174 ASSERT_TRUE(web_frame != NULL);
175 // Add input file URl to links_.
176 links_.assign(&page_url,1);
177 // Add dummy file path to local_path_.
178 WebString file_path = webkit_glue::FilePathStringToWebString(
179 FILE_PATH_LITERAL("c:\\dummy.htm"));
180 local_paths_.assign(&file_path, 1);
181 // Start serializing DOM.
182 bool result = WebPageSerializer::serialize(web_frame,
183 recursive_serialization,
184 static_cast<WebPageSerializerClient*>(this),
185 links_,
186 local_paths_,
187 webkit_glue::FilePathToWebString(local_directory_name_));
188 ASSERT_TRUE(result);
189 ASSERT_TRUE(serialized_);
190 }
191
192 private:
193 // Map frame_url to corresponding serialized_content.
194 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
195 SerializedFrameContentMap serialized_frame_map_;
196 // Map frame_url to corresponding status of serialization finish.
197 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
198 SerializationFinishStatusMap serialization_finish_status_;
199 // Flag indicates whether the process of serializing DOM is finished or not.
200 bool serialized_;
201 // The links_ contain dummy original URLs of all saved links.
202 WebVector<WebURL> links_;
203 // The local_paths_ contain dummy corresponding local file paths of all saved
204 // links, which matched links_ one by one.
205 WebVector<WebString> local_paths_;
206 // The local_directory_name_ is dummy relative path of directory which
207 // contain all saved auxiliary files included all sub frames and resources.
208 const FilePath local_directory_name_;
209
210 protected:
211 // testing::Test
SetUp()212 virtual void SetUp() {
213 TestShellTest::SetUp();
214 serialized_ = false;
215 }
216
TearDown()217 virtual void TearDown() {
218 TestShellTest::TearDown();
219 }
220 };
221
222 // Helper function that test whether the first node in the doc is a doc type
223 // node.
HasDocType(const WebDocument & doc)224 bool HasDocType(const WebDocument& doc) {
225 WebNode node = doc.firstChild();
226 if (node.isNull())
227 return false;
228 return node.nodeType() == WebNode::DocumentTypeNode;
229 }
230
231 // Helper function for checking whether input node is META tag. Return true
232 // means it is META element, otherwise return false. The parameter charset_info
233 // return actual charset info if the META tag has charset declaration.
IsMetaElement(const WebNode & node,std::string & charset_info)234 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
235 if (!node.isElementNode())
236 return false;
237 const WebElement meta = node.toConst<WebElement>();
238 if (!meta.hasTagName("meta"))
239 return false;
240 charset_info.erase(0, charset_info.length());
241 // Check the META charset declaration.
242 WebString httpEquiv = meta.getAttribute("http-equiv");
243 if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
244 std::string content = meta.getAttribute("content").utf8();
245 int pos = content.find("charset", 0);
246 if (pos > -1) {
247 // Add a dummy charset declaration to charset_info, which indicates this
248 // META tag has charset declaration although we do not get correct value
249 // yet.
250 charset_info.append("has-charset-declaration");
251 int remaining_length = content.length() - pos - 7;
252 if (!remaining_length)
253 return true;
254 int start_pos = pos + 7;
255 // Find "=" symbol.
256 while (remaining_length--)
257 if (content[start_pos++] == L'=')
258 break;
259 // Skip beginning space.
260 while (remaining_length) {
261 if (content[start_pos] > 0x0020)
262 break;
263 ++start_pos;
264 --remaining_length;
265 }
266 if (!remaining_length)
267 return true;
268 int end_pos = start_pos;
269 // Now we find out the start point of charset info. Search the end point.
270 while (remaining_length--) {
271 if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
272 break;
273 ++end_pos;
274 }
275 // Get actual charset info.
276 charset_info = content.substr(start_pos, end_pos - start_pos);
277 return true;
278 }
279 }
280 return true;
281 }
282
283 // If original contents have document type, the serialized contents also have
284 // document type.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithDocType)285 TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
286 FilePath page_file_path = data_dir_;
287 page_file_path = page_file_path.AppendASCII("dom_serializer");
288 page_file_path = page_file_path.AppendASCII("youtube_1.htm");
289 GURL file_url = net::FilePathToFileURL(page_file_path);
290 ASSERT_TRUE(file_url.SchemeIsFile());
291 // Load the test file.
292 LoadPageFromURL(file_url);
293 // Make sure original contents have document type.
294 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
295 ASSERT_TRUE(web_frame != NULL);
296 WebDocument doc = web_frame->document();
297 ASSERT_TRUE(HasDocType(doc));
298 // Do serialization.
299 SerializeDomForURL(file_url, false);
300 // Load the serialized contents.
301 ASSERT_TRUE(HasSerializedFrame(file_url));
302 const std::string& serialized_contents =
303 GetSerializedContentForFrame(file_url);
304 LoadContents(serialized_contents, file_url,
305 web_frame->encoding());
306 // Make sure serialized contents still have document type.
307 web_frame = test_shell_->webView()->mainFrame();
308 doc = web_frame->document();
309 ASSERT_TRUE(HasDocType(doc));
310 }
311
312 // If original contents do not have document type, the serialized contents
313 // also do not have document type.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithoutDocType)314 TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
315 FilePath page_file_path = data_dir_;
316 page_file_path = page_file_path.AppendASCII("dom_serializer");
317 page_file_path = page_file_path.AppendASCII("youtube_2.htm");
318 GURL file_url = net::FilePathToFileURL(page_file_path);
319 ASSERT_TRUE(file_url.SchemeIsFile());
320 // Load the test file.
321 LoadPageFromURL(file_url);
322 // Make sure original contents do not have document type.
323 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
324 ASSERT_TRUE(web_frame != NULL);
325 WebDocument doc = web_frame->document();
326 ASSERT_TRUE(!HasDocType(doc));
327 // Do serialization.
328 SerializeDomForURL(file_url, false);
329 // Load the serialized contents.
330 ASSERT_TRUE(HasSerializedFrame(file_url));
331 const std::string& serialized_contents =
332 GetSerializedContentForFrame(file_url);
333 LoadContents(serialized_contents, file_url,
334 web_frame->encoding());
335 // Make sure serialized contents do not have document type.
336 web_frame = test_shell_->webView()->mainFrame();
337 doc = web_frame->document();
338 ASSERT_TRUE(!HasDocType(doc));
339 }
340
341 // Serialize XML document which has all 5 built-in entities. After
342 // finishing serialization, the serialized contents should be same
343 // with original XML document.
TEST_F(DomSerializerTests,SerializeXMLDocWithBuiltInEntities)344 TEST_F(DomSerializerTests, SerializeXMLDocWithBuiltInEntities) {
345 FilePath page_file_path = data_dir_;
346 page_file_path = page_file_path.AppendASCII("dom_serializer");
347 page_file_path = page_file_path.AppendASCII("note.xml");
348 // Read original contents for later comparison.
349 std::string original_contents;
350 ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
351 // Get file URL.
352 GURL file_url = net::FilePathToFileURL(page_file_path);
353 ASSERT_TRUE(file_url.SchemeIsFile());
354 // Load the test file.
355 LoadPageFromURL(file_url);
356 // Do serialization.
357 SerializeDomForURL(file_url, false);
358 // Compare the serialized contents with original contents.
359 ASSERT_TRUE(HasSerializedFrame(file_url));
360 const std::string& serialized_contents =
361 GetSerializedContentForFrame(file_url);
362 ASSERT_EQ(original_contents, serialized_contents);
363 }
364
365 // When serializing DOM, we add MOTW declaration before html tag.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithAddingMOTW)366 TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
367 FilePath page_file_path = data_dir_;
368 page_file_path = page_file_path.AppendASCII("dom_serializer");
369 page_file_path = page_file_path.AppendASCII("youtube_2.htm");
370 // Read original contents for later comparison .
371 std::string original_contents;
372 ASSERT_TRUE(file_util::ReadFileToString(page_file_path, &original_contents));
373 // Get file URL.
374 GURL file_url = net::FilePathToFileURL(page_file_path);
375 ASSERT_TRUE(file_url.SchemeIsFile());
376 // Make sure original contents does not have MOTW;
377 std::string motw_declaration =
378 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
379 ASSERT_FALSE(motw_declaration.empty());
380 // The encoding of original contents is ISO-8859-1, so we convert the MOTW
381 // declaration to ASCII and search whether original contents has it or not.
382 ASSERT_TRUE(std::string::npos ==
383 original_contents.find(motw_declaration));
384 // Load the test file.
385 LoadPageFromURL(file_url);
386 // Do serialization.
387 SerializeDomForURL(file_url, false);
388 // Make sure the serialized contents have MOTW ;
389 ASSERT_TRUE(HasSerializedFrame(file_url));
390 const std::string& serialized_contents =
391 GetSerializedContentForFrame(file_url);
392 ASSERT_FALSE(std::string::npos ==
393 serialized_contents.find(motw_declaration));
394 }
395
396 // When serializing DOM, we will add the META which have correct charset
397 // declaration as first child of HEAD element for resolving WebKit bug:
398 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
399 // does not have META charset declaration.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc)400 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
401 FilePath page_file_path = data_dir_;
402 page_file_path = page_file_path.AppendASCII("dom_serializer");
403 page_file_path = page_file_path.AppendASCII("youtube_1.htm");
404 // Get file URL.
405 GURL file_url = net::FilePathToFileURL(page_file_path);
406 ASSERT_TRUE(file_url.SchemeIsFile());
407 // Load the test file.
408 LoadPageFromURL(file_url);
409
410 // Make sure there is no META charset declaration in original document.
411 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
412 ASSERT_TRUE(web_frame != NULL);
413 WebDocument doc = web_frame->document();
414 ASSERT_TRUE(doc.isHTMLDocument());
415 WebElement head_element = doc.head();
416 ASSERT_TRUE(!head_element.isNull());
417 // Go through all children of HEAD element.
418 for (WebNode child = head_element.firstChild(); !child.isNull();
419 child = child.nextSibling()) {
420 std::string charset_info;
421 if (IsMetaElement(child, charset_info))
422 ASSERT_TRUE(charset_info.empty());
423 }
424 // Do serialization.
425 SerializeDomForURL(file_url, false);
426
427 // Load the serialized contents.
428 ASSERT_TRUE(HasSerializedFrame(file_url));
429 const std::string& serialized_contents =
430 GetSerializedContentForFrame(file_url);
431 LoadContents(serialized_contents, file_url,
432 web_frame->encoding());
433 // Make sure the first child of HEAD element is META which has charset
434 // declaration in serialized contents.
435 web_frame = test_shell_->webView()->mainFrame();
436 ASSERT_TRUE(web_frame != NULL);
437 doc = web_frame->document();
438 ASSERT_TRUE(doc.isHTMLDocument());
439 head_element = doc.head();
440 ASSERT_TRUE(!head_element.isNull());
441 WebNode meta_node = head_element.firstChild();
442 ASSERT_TRUE(!meta_node.isNull());
443 // Get meta charset info.
444 std::string charset_info2;
445 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
446 ASSERT_TRUE(!charset_info2.empty());
447 ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
448
449 // Make sure no more additional META tags which have charset declaration.
450 for (WebNode child = meta_node.nextSibling(); !child.isNull();
451 child = child.nextSibling()) {
452 std::string charset_info;
453 if (IsMetaElement(child, charset_info))
454 ASSERT_TRUE(charset_info.empty());
455 }
456 }
457
458 // When serializing DOM, if the original document has multiple META charset
459 // declaration, we will add the META which have correct charset declaration
460 // as first child of HEAD element and remove all original META charset
461 // declarations.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc)462 TEST_F(DomSerializerTests,
463 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
464 FilePath page_file_path = data_dir_;
465 page_file_path = page_file_path.AppendASCII("dom_serializer");
466 page_file_path = page_file_path.AppendASCII("youtube_2.htm");
467 // Get file URL.
468 GURL file_url = net::FilePathToFileURL(page_file_path);
469 ASSERT_TRUE(file_url.SchemeIsFile());
470 // Load the test file.
471 LoadPageFromURL(file_url);
472
473 // Make sure there are multiple META charset declarations in original
474 // document.
475 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
476 ASSERT_TRUE(web_frame != NULL);
477 WebDocument doc = web_frame->document();
478 ASSERT_TRUE(doc.isHTMLDocument());
479 WebElement head_ele = doc.head();
480 ASSERT_TRUE(!head_ele.isNull());
481 // Go through all children of HEAD element.
482 int charset_declaration_count = 0;
483 for (WebNode child = head_ele.firstChild(); !child.isNull();
484 child = child.nextSibling()) {
485 std::string charset_info;
486 if (IsMetaElement(child, charset_info) && !charset_info.empty())
487 charset_declaration_count++;
488 }
489 // The original doc has more than META tags which have charset declaration.
490 ASSERT_TRUE(charset_declaration_count > 1);
491
492 // Do serialization.
493 SerializeDomForURL(file_url, false);
494
495 // Load the serialized contents.
496 ASSERT_TRUE(HasSerializedFrame(file_url));
497 const std::string& serialized_contents =
498 GetSerializedContentForFrame(file_url);
499 LoadContents(serialized_contents, file_url,
500 web_frame->encoding());
501 // Make sure only first child of HEAD element is META which has charset
502 // declaration in serialized contents.
503 web_frame = test_shell_->webView()->mainFrame();
504 ASSERT_TRUE(web_frame != NULL);
505 doc = web_frame->document();
506 ASSERT_TRUE(doc.isHTMLDocument());
507 head_ele = doc.head();
508 ASSERT_TRUE(!head_ele.isNull());
509 WebNode meta_node = head_ele.firstChild();
510 ASSERT_TRUE(!meta_node.isNull());
511 // Get meta charset info.
512 std::string charset_info2;
513 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
514 ASSERT_TRUE(!charset_info2.empty());
515 ASSERT_TRUE(charset_info2 == std::string(web_frame->encoding().utf8()));
516
517 // Make sure no more additional META tags which have charset declaration.
518 for (WebNode child = meta_node.nextSibling(); !child.isNull();
519 child = child.nextSibling()) {
520 std::string charset_info;
521 if (IsMetaElement(child, charset_info))
522 ASSERT_TRUE(charset_info.empty());
523 }
524 }
525
526 // Test situation of html entities in text when serializing HTML DOM.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithEntitiesInText)527 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
528 FilePath page_file_path = data_dir_;
529 page_file_path = page_file_path.AppendASCII(
530 "dom_serializer/htmlentities_in_text.htm");
531 // Get file URL. The URL is dummy URL to identify the following loading
532 // actions. The test content is in constant:original_contents.
533 GURL file_url = net::FilePathToFileURL(page_file_path);
534 ASSERT_TRUE(file_url.SchemeIsFile());
535 // Test contents.
536 static const char* const original_contents =
537 "<html><body>&<>\"\'</body></html>";
538 // Load the test contents.
539 LoadContents(original_contents, file_url, WebString());
540
541 // Get BODY's text content in DOM.
542 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
543 ASSERT_TRUE(web_frame != NULL);
544 WebDocument doc = web_frame->document();
545 ASSERT_TRUE(doc.isHTMLDocument());
546 WebElement body_ele = doc.body();
547 ASSERT_TRUE(!body_ele.isNull());
548 WebNode text_node = body_ele.firstChild();
549 ASSERT_TRUE(text_node.isTextNode());
550 ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
551 "&<>\"\'");
552 // Do serialization.
553 SerializeDomForURL(file_url, false);
554 // Compare the serialized contents with original contents.
555 ASSERT_TRUE(HasSerializedFrame(file_url));
556 const std::string& serialized_contents =
557 GetSerializedContentForFrame(file_url);
558 // Compare the serialized contents with original contents to make sure
559 // they are same.
560 // Because we add MOTW when serializing DOM, so before comparison, we also
561 // need to add MOTW to original_contents.
562 std::string original_str =
563 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
564 original_str += original_contents;
565 // Since WebCore now inserts a new HEAD element if there is no HEAD element
566 // when creating BODY element. (Please see HTMLParser::bodyCreateErrorCheck.)
567 // We need to append the HEAD content and corresponding META content if we
568 // find WebCore-generated HEAD element.
569 if (!doc.head().isNull()) {
570 WebString encoding = web_frame->encoding();
571 std::string htmlTag("<html>");
572 std::string::size_type pos = original_str.find(htmlTag);
573 ASSERT_NE(std::string::npos, pos);
574 pos += htmlTag.length();
575 std::string head_part("<head>");
576 head_part +=
577 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
578 head_part += "</head>";
579 original_str.insert(pos, head_part);
580 }
581 ASSERT_EQ(original_str, serialized_contents);
582 }
583
584 // Test situation of html entities in attribute value when serializing
585 // HTML DOM.
586 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithEntitiesInAttributeValue)587 TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInAttributeValue) {
588 FilePath page_file_path = data_dir_;
589 page_file_path = page_file_path.AppendASCII(
590 "dom_serializer/htmlentities_in_attribute_value.htm");
591 // Get file URL. The URL is dummy URL to identify the following loading
592 // actions. The test content is in constant:original_contents.
593 GURL file_url = net::FilePathToFileURL(page_file_path);
594 ASSERT_TRUE(file_url.SchemeIsFile());
595 // Test contents.
596 static const char* const original_contents =
597 "<html><body title=\"&<>"'\"></body></html>";
598 // Load the test contents.
599 LoadContents(original_contents, file_url, WebString());
600 // Get value of BODY's title attribute in DOM.
601 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
602 ASSERT_TRUE(web_frame != NULL);
603 WebDocument doc = web_frame->document();
604 ASSERT_TRUE(doc.isHTMLDocument());
605 WebElement body_ele = doc.body();
606 ASSERT_TRUE(!body_ele.isNull());
607 WebString value = body_ele.getAttribute("title");
608 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
609 // Do serialization.
610 SerializeDomForURL(file_url, false);
611 // Compare the serialized contents with original contents.
612 ASSERT_TRUE(HasSerializedFrame(file_url));
613 const std::string& serialized_contents =
614 GetSerializedContentForFrame(file_url);
615 // Compare the serialized contents with original contents to make sure
616 // they are same.
617 std::string original_str =
618 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
619 original_str += original_contents;
620 if (!doc.isNull()) {
621 WebString encoding = web_frame->encoding();
622 std::string htmlTag("<html>");
623 std::string::size_type pos = original_str.find(htmlTag);
624 ASSERT_NE(std::string::npos, pos);
625 pos += htmlTag.length();
626 std::string head_part("<head>");
627 head_part +=
628 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
629 head_part += "</head>";
630 original_str.insert(pos, head_part);
631 }
632 ASSERT_EQ(original_str, serialized_contents);
633 }
634
635 // Test situation of non-standard HTML entities when serializing HTML DOM.
636 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithNonStandardEntities)637 TEST_F(DomSerializerTests, SerializeHTMLDOMWithNonStandardEntities) {
638 // Make a test file URL and load it.
639 FilePath page_file_path = data_dir_;
640 page_file_path = page_file_path.AppendASCII("dom_serializer");
641 page_file_path = page_file_path.AppendASCII("nonstandard_htmlentities.htm");
642 GURL file_url = net::FilePathToFileURL(page_file_path);
643 LoadPageFromURL(file_url);
644
645 // Get value of BODY's title attribute in DOM.
646 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
647 WebDocument doc = web_frame->document();
648 ASSERT_TRUE(doc.isHTMLDocument());
649 WebElement body_element = doc.body();
650 // Unescaped string for "%⊅¹'".
651 static const wchar_t parsed_value[] = {
652 '%', 0x2285, 0x00b9, '\'', 0
653 };
654 WebString value = body_element.getAttribute("title");
655 ASSERT_TRUE(UTF16ToWide(value) == parsed_value);
656 ASSERT_TRUE(UTF16ToWide(body_element.innerText()) == parsed_value);
657
658 // Do serialization.
659 SerializeDomForURL(file_url, false);
660 // Check the serialized string.
661 ASSERT_TRUE(HasSerializedFrame(file_url));
662 const std::string& serialized_contents =
663 GetSerializedContentForFrame(file_url);
664 // Confirm that the serialized string has no non-standard HTML entities.
665 ASSERT_EQ(std::string::npos, serialized_contents.find("%"));
666 ASSERT_EQ(std::string::npos, serialized_contents.find("⊅"));
667 ASSERT_EQ(std::string::npos, serialized_contents.find("¹"));
668 ASSERT_EQ(std::string::npos, serialized_contents.find("'"));
669 }
670
671 // Test situation of BASE tag in original document when serializing HTML DOM.
672 // When serializing, we should comment the BASE tag, append a new BASE tag.
673 // rewrite all the savable URLs to relative local path, and change other URLs
674 // to absolute URLs.
TEST_F(DomSerializerTests,SerializeHTMLDOMWithBaseTag)675 TEST_F(DomSerializerTests, SerializeHTMLDOMWithBaseTag) {
676 // There are total 2 available base tags in this test file.
677 const int kTotalBaseTagCountInTestFile = 2;
678
679 FilePath page_file_path = data_dir_.AppendASCII("dom_serializer");
680 file_util::EnsureEndsWithSeparator(&page_file_path);
681
682 // Get page dir URL which is base URL of this file.
683 GURL path_dir_url = net::FilePathToFileURL(page_file_path);
684 // Get file path.
685 page_file_path =
686 page_file_path.AppendASCII("html_doc_has_base_tag.htm");
687 // Get file URL.
688 GURL file_url = net::FilePathToFileURL(page_file_path);
689 ASSERT_TRUE(file_url.SchemeIsFile());
690 // Load the test file.
691 LoadPageFromURL(file_url);
692 // Since for this test, we assume there is no savable sub-resource links for
693 // this test file, also all links are relative URLs in this test file, so we
694 // need to check those relative URLs and make sure document has BASE tag.
695 WebFrame* web_frame = FindSubFrameByURL(test_shell_->webView(), file_url);
696 ASSERT_TRUE(web_frame != NULL);
697 WebDocument doc = web_frame->document();
698 ASSERT_TRUE(doc.isHTMLDocument());
699 // Go through all descent nodes.
700 WebNodeCollection all = doc.all();
701 int original_base_tag_count = 0;
702 for (WebNode node = all.firstItem(); !node.isNull();
703 node = all.nextItem()) {
704 if (!node.isElementNode())
705 continue;
706 WebElement element = node.to<WebElement>();
707 if (element.hasTagName("base")) {
708 original_base_tag_count++;
709 } else {
710 // Get link.
711 WebString value =
712 webkit_glue::GetSubResourceLinkFromElement(element);
713 if (value.isNull() && element.hasTagName("a")) {
714 value = element.getAttribute("href");
715 if (value.isEmpty())
716 value = WebString();
717 }
718 // Each link is relative link.
719 if (!value.isNull()) {
720 GURL link(value.utf8());
721 ASSERT_TRUE(link.scheme().empty());
722 }
723 }
724 }
725 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
726 // Make sure in original document, the base URL is not equal with the
727 // |path_dir_url|.
728 GURL original_base_url(doc.baseURL());
729 ASSERT_NE(original_base_url, path_dir_url);
730
731 // Do serialization.
732 SerializeDomForURL(file_url, false);
733
734 // Load the serialized contents.
735 ASSERT_TRUE(HasSerializedFrame(file_url));
736 const std::string& serialized_contents =
737 GetSerializedContentForFrame(file_url);
738 LoadContents(serialized_contents, file_url,
739 web_frame->encoding());
740
741 // Make sure all links are absolute URLs and doc there are some number of
742 // BASE tags in serialized HTML data. Each of those BASE tags have same base
743 // URL which is as same as URL of current test file.
744 web_frame = test_shell_->webView()->mainFrame();
745 ASSERT_TRUE(web_frame != NULL);
746 doc = web_frame->document();
747 ASSERT_TRUE(doc.isHTMLDocument());
// Go through all descendant nodes.
749 all = doc.all();
750 int new_base_tag_count = 0;
751 for (WebNode node = all.firstItem(); !node.isNull();
752 node = all.nextItem()) {
753 if (!node.isElementNode())
754 continue;
755 WebElement element = node.to<WebElement>();
756 if (element.hasTagName("base")) {
757 new_base_tag_count++;
758 } else {
759 // Get link.
760 WebString value =
761 webkit_glue::GetSubResourceLinkFromElement(element);
762 if (value.isNull() && element.hasTagName("a")) {
763 value = element.getAttribute("href");
764 if (value.isEmpty())
765 value = WebString();
766 }
767 // Each link is absolute link.
768 if (!value.isNull()) {
769 GURL link(std::string(value.utf8()));
770 ASSERT_FALSE(link.scheme().empty());
771 }
772 }
773 }
774 // We have one more added BASE tag which is generated by JavaScript.
775 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
776 // Make sure in new document, the base URL is equal with the |path_dir_url|.
777 GURL new_base_url(doc.baseURL());
778 ASSERT_EQ(new_base_url, path_dir_url);
779 }
780
781 // Serializing page which has an empty HEAD tag.
TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
  // Loads a document whose HEAD has no children, serializes it, reloads the
  // serialized output, and checks that the HEAD now has exactly one child
  // which IsMetaElement() accepts and whose charset matches the frame's
  // encoding. Finally checks that the BODY text is still "hello world".
  FilePath page_file_path = data_dir_;
  page_file_path = page_file_path.AppendASCII("dom_serializer");
  page_file_path = page_file_path.AppendASCII("empty_head.htm");
  GURL file_url = net::FilePathToFileURL(page_file_path);
  ASSERT_TRUE(file_url.SchemeIsFile());

  // Load the test html content. The markup is supplied inline; |file_url| is
  // only passed to LoadContents() as the URL to load it under.
  static const char* const empty_head_contents =
      "<html><head></head><body>hello world</body></html>";
  LoadContents(empty_head_contents, file_url, WebString());

  // Make sure the head tag is empty before serialization.
  WebFrame* web_frame = test_shell_->webView()->mainFrame();
  ASSERT_TRUE(web_frame != NULL);
  WebDocument doc = web_frame->document();
  ASSERT_TRUE(doc.isHTMLDocument());
  WebElement head_element = doc.head();
  ASSERT_FALSE(head_element.isNull());
  ASSERT_FALSE(head_element.hasChildNodes());
  // ASSERT_EQ (rather than ASSERT_TRUE(a == b)) so that a failure reports the
  // actual child count.
  ASSERT_EQ(0U, head_element.childNodes().length());

  // Do serialization.
  SerializeDomForURL(file_url, false);
  // Make sure serialized contents were produced for this frame.
  ASSERT_TRUE(HasSerializedFrame(file_url));
  const std::string& serialized_contents =
      GetSerializedContentForFrame(file_url);

  // Reload serialized contents and make sure there is only one META tag.
  LoadContents(serialized_contents, file_url, web_frame->encoding());
  // Re-fetch the frame and document after the reload instead of reusing the
  // handles obtained before it.
  web_frame = test_shell_->webView()->mainFrame();
  ASSERT_TRUE(web_frame != NULL);
  doc = web_frame->document();
  ASSERT_TRUE(doc.isHTMLDocument());
  head_element = doc.head();
  ASSERT_FALSE(head_element.isNull());
  ASSERT_TRUE(head_element.hasChildNodes());
  ASSERT_EQ(1U, head_element.childNodes().length());
  WebNode meta_node = head_element.firstChild();
  ASSERT_FALSE(meta_node.isNull());
  // Get meta charset info and compare it with the frame's encoding.
  std::string charset_info;
  ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
  ASSERT_FALSE(charset_info.empty());
  ASSERT_EQ(std::string(web_frame->encoding().utf8()), charset_info);

  // Check the body's first node is text node and its contents are
  // "hello world".
  WebElement body_element = doc.body();
  ASSERT_FALSE(body_element.isNull());
  WebNode text_node = body_element.firstChild();
  ASSERT_TRUE(text_node.isTextNode());
  WebString text_node_contents = text_node.nodeValue();
  ASSERT_EQ(std::string("hello world"),
            std::string(text_node_contents.utf8()));
}
838
839 // Test that we don't crash when the page contains an iframe that
840 // was handled as a download (http://crbug.com/42212).
TEST_F(DomSerializerTests, SerializeDocumentWithDownloadedIFrame) {
  // Crash regression check (http://crbug.com/42212): there are no assertions
  // on the serialized output; getting through serialization is the pass
  // condition.

  // Build the file:// URL of the test page under the data directory.
  FilePath exe_iframe_page = FilePath(data_dir_)
                                 .AppendASCII("dom_serializer")
                                 .AppendASCII("iframe-src-is-exe.htm");
  GURL page_url = net::FilePathToFileURL(exe_iframe_page);
  ASSERT_TRUE(page_url.SchemeIsFile());

  // Load the page, then serialize it recursively.
  LoadPageFromURL(page_url);
  SerializeDomForURL(page_url, true);
}
852
853 } // namespace
854