• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/utility/importer/bookmark_html_reader.h"
6 
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/time/time.h"
14 #include "chrome/common/importer/imported_bookmark_entry.h"
15 #include "chrome/common/importer/imported_favicon_usage.h"
16 #include "chrome/utility/importer/favicon_reencode.h"
17 #include "net/base/data_url.h"
18 #include "net/base/escape.h"
19 #include "url/gurl.h"
20 #include "url/url_constants.h"
21 
22 namespace {
23 
// Fetches the value of |attribute| from |attribute_list|, which is expected to
// hold attributes written as NAME="value". The lookup is a case-sensitive
// substring search for NAME=".
//
// Returns true and stores the text found between the quotes in |value| (escape
// sequences are copied verbatim) when the value is closed by an unescaped
// quote. An empty value (NAME="") is valid and yields an empty string.
// Returns false and leaves |value| untouched when the attribute is missing or
// its value is never terminated.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Skip the attribute name, the '=' and the opening quote.
  begin += attribute.size() + 2;

  // Scan from |begin| itself so that an empty value is recognized. The
  // character before |begin| is the opening quote, never a backslash, so the
  // escape check below is safe for the first position as well.
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end >= attribute_list.size())
    return false;  // The value is not terminated by a quote.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
52 
53 // Given the URL of a page and a favicon data URL, adds an appropriate record
54 // to the given favicon usage vector.
DataURLToFaviconUsage(const GURL & link_url,const GURL & favicon_data,std::vector<ImportedFaviconUsage> * favicons)55 void DataURLToFaviconUsage(
56     const GURL& link_url,
57     const GURL& favicon_data,
58     std::vector<ImportedFaviconUsage>* favicons) {
59   if (!link_url.is_valid() || !favicon_data.is_valid() ||
60       !favicon_data.SchemeIs(url::kDataScheme))
61     return;
62 
63   // Parse the data URL.
64   std::string mime_type, char_set, data;
65   if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
66       data.empty())
67     return;
68 
69   ImportedFaviconUsage usage;
70   if (!importer::ReencodeFavicon(
71           reinterpret_cast<const unsigned char*>(&data[0]),
72           data.size(), &usage.png_data))
73     return;  // Unable to decode.
74 
75   // We need to make up a URL for the favicon. We use a version of the page's
76   // URL so that we can be sure it will not collide.
77   usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
78 
79   // We only have one URL per favicon for Firefox 2 bookmarks.
80   usage.urls.insert(link_url);
81 
82   favicons->push_back(usage);
83 }
84 
85 }  // namespace
86 
87 namespace bookmark_html_reader {
88 
ImportBookmarksFile(const base::Callback<bool (void)> & cancellation_callback,const base::Callback<bool (const GURL &)> & valid_url_callback,const base::FilePath & file_path,std::vector<ImportedBookmarkEntry> * bookmarks,std::vector<ImportedFaviconUsage> * favicons)89 void ImportBookmarksFile(
90       const base::Callback<bool(void)>& cancellation_callback,
91       const base::Callback<bool(const GURL&)>& valid_url_callback,
92       const base::FilePath& file_path,
93       std::vector<ImportedBookmarkEntry>* bookmarks,
94       std::vector<ImportedFaviconUsage>* favicons) {
95   std::string content;
96   base::ReadFileToString(file_path, &content);
97   std::vector<std::string> lines;
98   base::SplitString(content, '\n', &lines);
99 
100   base::string16 last_folder;
101   bool last_folder_on_toolbar = false;
102   bool last_folder_is_empty = true;
103   bool has_subfolder = false;
104   base::Time last_folder_add_date;
105   std::vector<base::string16> path;
106   size_t toolbar_folder_index = 0;
107   std::string charset;
108   for (size_t i = 0;
109        i < lines.size() &&
110            (cancellation_callback.is_null() || !cancellation_callback.Run());
111        ++i) {
112     std::string line;
113     base::TrimString(lines[i], " ", &line);
114 
115     // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
116     // separator in Firefox that Chrome does not support. Note that there can be
117     // multiple "<HR>" tags at the beginning of a single line.
118     // See http://crbug.com/257474.
119     static const char kHrTag[] = "<HR>";
120     while (StartsWithASCII(line, kHrTag, false)) {
121       line.erase(0, arraysize(kHrTag) - 1);
122       base::TrimString(line, " ", &line);
123     }
124 
125     // Get the encoding of the bookmark file.
126     if (internal::ParseCharsetFromLine(line, &charset))
127       continue;
128 
129     // Get the folder name.
130     if (internal::ParseFolderNameFromLine(line,
131                                           charset,
132                                           &last_folder,
133                                           &last_folder_on_toolbar,
134                                           &last_folder_add_date)) {
135       continue;
136     }
137 
138     // Get the bookmark entry.
139     base::string16 title;
140     base::string16 shortcut;
141     GURL url, favicon;
142     base::Time add_date;
143     base::string16 post_data;
144     bool is_bookmark;
145     // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
146     //                keywords yet.
147     is_bookmark =
148         internal::ParseBookmarkFromLine(line, charset, &title,
149                                         &url, &favicon, &shortcut,
150                                         &add_date, &post_data) ||
151         internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);
152 
153     if (is_bookmark)
154       last_folder_is_empty = false;
155 
156     if (is_bookmark &&
157         post_data.empty() &&
158         (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
159       if (toolbar_folder_index > path.size() && !path.empty()) {
160         NOTREACHED();  // error in parsing.
161         break;
162       }
163 
164       ImportedBookmarkEntry entry;
165       entry.creation_time = add_date;
166       entry.url = url;
167       entry.title = title;
168 
169       if (toolbar_folder_index) {
170         // The toolbar folder should be at the top level.
171         entry.in_toolbar = true;
172         entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
173       } else {
174         // Add this bookmark to the list of |bookmarks|.
175         if (!has_subfolder && !last_folder.empty()) {
176           path.push_back(last_folder);
177           last_folder.clear();
178         }
179         entry.path.assign(path.begin(), path.end());
180       }
181       bookmarks->push_back(entry);
182 
183       // Save the favicon. DataURLToFaviconUsage will handle the case where
184       // there is no favicon.
185       if (favicons)
186         DataURLToFaviconUsage(url, favicon, favicons);
187 
188       continue;
189     }
190 
191     // Bookmarks in sub-folder are encapsulated with <DL> tag.
192     if (StartsWithASCII(line, "<DL>", false)) {
193       has_subfolder = true;
194       if (!last_folder.empty()) {
195         path.push_back(last_folder);
196         last_folder.clear();
197       }
198       if (last_folder_on_toolbar && !toolbar_folder_index)
199         toolbar_folder_index = path.size();
200 
201       // Mark next folder empty as initial state.
202       last_folder_is_empty = true;
203     } else if (StartsWithASCII(line, "</DL>", false)) {
204       if (path.empty())
205         break;  // Mismatch <DL>.
206 
207       base::string16 folder_title = path.back();
208       path.pop_back();
209 
210       if (last_folder_is_empty) {
211         // Empty folder should be added explicitly.
212         ImportedBookmarkEntry entry;
213         entry.is_folder = true;
214         entry.creation_time = last_folder_add_date;
215         entry.title = folder_title;
216         if (toolbar_folder_index) {
217           // The toolbar folder should be at the top level.
218           // Make sure we don't add the toolbar folder itself if it is empty.
219           if (toolbar_folder_index <= path.size()) {
220             entry.in_toolbar = true;
221             entry.path.assign(path.begin() + toolbar_folder_index - 1,
222                               path.end());
223             bookmarks->push_back(entry);
224           }
225         } else {
226           // Add this folder to the list of |bookmarks|.
227           entry.path.assign(path.begin(), path.end());
228           bookmarks->push_back(entry);
229         }
230 
231         // Parent folder include current one, so it's not empty.
232         last_folder_is_empty = false;
233       }
234 
235       if (toolbar_folder_index > path.size())
236         toolbar_folder_index = 0;
237     }
238   }
239 }
240 
241 namespace internal {
242 
ParseCharsetFromLine(const std::string & line,std::string * charset)243 bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
244   const char kCharset[] = "charset=";
245   if (StartsWithASCII(line, "<META", false) &&
246       (line.find("CONTENT=\"") != std::string::npos ||
247           line.find("content=\"") != std::string::npos)) {
248     size_t begin = line.find(kCharset);
249     if (begin == std::string::npos)
250       return false;
251     begin += std::string(kCharset).size();
252     size_t end = line.find_first_of('\"', begin);
253     *charset = line.substr(begin, end - begin);
254     return true;
255   }
256   return false;
257 }
258 
ParseFolderNameFromLine(const std::string & line,const std::string & charset,base::string16 * folder_name,bool * is_toolbar_folder,base::Time * add_date)259 bool ParseFolderNameFromLine(const std::string& line,
260                              const std::string& charset,
261                              base::string16* folder_name,
262                              bool* is_toolbar_folder,
263                              base::Time* add_date) {
264   const char kFolderOpen[] = "<DT><H3";
265   const char kFolderClose[] = "</H3>";
266   const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
267   const char kAddDateAttribute[] = "ADD_DATE";
268 
269   if (!StartsWithASCII(line, kFolderOpen, true))
270     return false;
271 
272   size_t end = line.find(kFolderClose);
273   size_t tag_end = line.rfind('>', end) + 1;
274   // If no end tag or start tag is broken, we skip to find the folder name.
275   if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
276     return false;
277 
278   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
279                         base::OnStringConversionError::SKIP, folder_name);
280   *folder_name = net::UnescapeForHTML(*folder_name);
281 
282   std::string attribute_list = line.substr(arraysize(kFolderOpen),
283       tag_end - arraysize(kFolderOpen) - 1);
284   std::string value;
285 
286   // Add date
287   if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
288     int64 time;
289     base::StringToInt64(value, &time);
290     // Upper bound it at 32 bits.
291     if (0 < time && time < (1LL << 32))
292       *add_date = base::Time::FromTimeT(time);
293   }
294 
295   if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
296       LowerCaseEqualsASCII(value, "true"))
297     *is_toolbar_folder = true;
298   else
299     *is_toolbar_folder = false;
300 
301   return true;
302 }
303 
ParseBookmarkFromLine(const std::string & line,const std::string & charset,base::string16 * title,GURL * url,GURL * favicon,base::string16 * shortcut,base::Time * add_date,base::string16 * post_data)304 bool ParseBookmarkFromLine(const std::string& line,
305                            const std::string& charset,
306                            base::string16* title,
307                            GURL* url,
308                            GURL* favicon,
309                            base::string16* shortcut,
310                            base::Time* add_date,
311                            base::string16* post_data) {
312   const char kItemOpen[] = "<DT><A";
313   const char kItemClose[] = "</A>";
314   const char kFeedURLAttribute[] = "FEEDURL";
315   const char kHrefAttribute[] = "HREF";
316   const char kIconAttribute[] = "ICON";
317   const char kShortcutURLAttribute[] = "SHORTCUTURL";
318   const char kAddDateAttribute[] = "ADD_DATE";
319   const char kPostDataAttribute[] = "POST_DATA";
320 
321   title->clear();
322   *url = GURL();
323   *favicon = GURL();
324   shortcut->clear();
325   post_data->clear();
326   *add_date = base::Time();
327 
328   if (!StartsWithASCII(line, kItemOpen, true))
329     return false;
330 
331   size_t end = line.find(kItemClose);
332   size_t tag_end = line.rfind('>', end) + 1;
333   if (end == std::string::npos || tag_end < arraysize(kItemOpen))
334     return false;  // No end tag or start tag is broken.
335 
336   std::string attribute_list = line.substr(arraysize(kItemOpen),
337       tag_end - arraysize(kItemOpen) - 1);
338 
339   // We don't import Live Bookmark folders, which is Firefox's RSS reading
340   // feature, since the user never necessarily bookmarked them and we don't
341   // have this feature to update their contents.
342   std::string value;
343   if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
344     return false;
345 
346   // Title
347   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
348                         base::OnStringConversionError::SKIP, title);
349   *title = net::UnescapeForHTML(*title);
350 
351   // URL
352   if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
353     base::string16 url16;
354     base::CodepageToUTF16(value, charset.c_str(),
355                           base::OnStringConversionError::SKIP, &url16);
356     url16 = net::UnescapeForHTML(url16);
357 
358     *url = GURL(url16);
359   }
360 
361   // Favicon
362   if (GetAttribute(attribute_list, kIconAttribute, &value))
363     *favicon = GURL(value);
364 
365   // Keyword
366   if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
367     base::CodepageToUTF16(value, charset.c_str(),
368                           base::OnStringConversionError::SKIP, shortcut);
369     *shortcut = net::UnescapeForHTML(*shortcut);
370   }
371 
372   // Add date
373   if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
374     int64 time;
375     base::StringToInt64(value, &time);
376     // Upper bound it at 32 bits.
377     if (0 < time && time < (1LL << 32))
378       *add_date = base::Time::FromTimeT(time);
379   }
380 
381   // Post data.
382   if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
383     base::CodepageToUTF16(value, charset.c_str(),
384                           base::OnStringConversionError::SKIP, post_data);
385     *post_data = net::UnescapeForHTML(*post_data);
386   }
387 
388   return true;
389 }
390 
ParseMinimumBookmarkFromLine(const std::string & line,const std::string & charset,base::string16 * title,GURL * url)391 bool ParseMinimumBookmarkFromLine(const std::string& line,
392                                   const std::string& charset,
393                                   base::string16* title,
394                                   GURL* url) {
395   const char kItemOpen[] = "<DT><A";
396   const char kItemClose[] = "</";
397   const char kHrefAttributeUpper[] = "HREF";
398   const char kHrefAttributeLower[] = "href";
399 
400   title->clear();
401   *url = GURL();
402 
403   // Case-insensitive check of open tag.
404   if (!StartsWithASCII(line, kItemOpen, false))
405     return false;
406 
407   // Find any close tag.
408   size_t end = line.find(kItemClose);
409   size_t tag_end = line.rfind('>', end) + 1;
410   if (end == std::string::npos || tag_end < arraysize(kItemOpen))
411     return false;  // No end tag or start tag is broken.
412 
413   std::string attribute_list = line.substr(arraysize(kItemOpen),
414       tag_end - arraysize(kItemOpen) - 1);
415 
416   // Title
417   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
418                         base::OnStringConversionError::SKIP, title);
419   *title = net::UnescapeForHTML(*title);
420 
421   // URL
422   std::string value;
423   if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
424       GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
425     if (charset.length() != 0) {
426       base::string16 url16;
427       base::CodepageToUTF16(value, charset.c_str(),
428                             base::OnStringConversionError::SKIP, &url16);
429       url16 = net::UnescapeForHTML(url16);
430 
431       *url = GURL(url16);
432     } else {
433       *url = GURL(value);
434     }
435   }
436 
437   return true;
438 }
439 
440 }  // namespace internal
441 
442 }  // namespace bookmark_html_reader
443