1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/utility/importer/bookmark_html_reader.h"
6
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/time/time.h"
14 #include "chrome/common/importer/imported_bookmark_entry.h"
15 #include "chrome/common/importer/imported_favicon_usage.h"
16 #include "chrome/utility/importer/favicon_reencode.h"
17 #include "net/base/data_url.h"
18 #include "net/base/escape.h"
19 #include "url/gurl.h"
20 #include "url/url_constants.h"
21
22 namespace {
23
// Looks up |attribute| in |attribute_list| and copies its double-quoted value
// into |value|.
//
// The attribute must appear literally as NAME="...". The value ends at the
// first double quote that is not immediately preceded by a backslash; the
// backslash itself is kept in the returned value (no unescaping is done).
//
// Returns true and fills |value| on success. An empty value (NAME="") is a
// success and yields an empty string. Returns false, leaving |value|
// untouched, when the attribute is absent or its closing quote is missing.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const std::string opening = attribute + "=\"";

  const size_t match = attribute_list.find(opening);
  if (match == std::string::npos)
    return false;  // Can't find the attribute.

  // |begin| is the index of the first character of the value.
  const size_t begin = match + opening.size();

  // Start scanning at |begin| itself (not one past it) so that an empty value,
  // whose closing quote sits exactly at |begin|, is recognized. The character
  // at |end - 1| always exists because |begin| is at least |opening.size()|;
  // for |end == begin| it is the opening quote, never a backslash.
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end >= attribute_list.size())
    return false;  // The value is not terminated by a closing quote.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
52
53 // Given the URL of a page and a favicon data URL, adds an appropriate record
54 // to the given favicon usage vector.
DataURLToFaviconUsage(const GURL & link_url,const GURL & favicon_data,std::vector<ImportedFaviconUsage> * favicons)55 void DataURLToFaviconUsage(
56 const GURL& link_url,
57 const GURL& favicon_data,
58 std::vector<ImportedFaviconUsage>* favicons) {
59 if (!link_url.is_valid() || !favicon_data.is_valid() ||
60 !favicon_data.SchemeIs(url::kDataScheme))
61 return;
62
63 // Parse the data URL.
64 std::string mime_type, char_set, data;
65 if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
66 data.empty())
67 return;
68
69 ImportedFaviconUsage usage;
70 if (!importer::ReencodeFavicon(
71 reinterpret_cast<const unsigned char*>(&data[0]),
72 data.size(), &usage.png_data))
73 return; // Unable to decode.
74
75 // We need to make up a URL for the favicon. We use a version of the page's
76 // URL so that we can be sure it will not collide.
77 usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
78
79 // We only have one URL per favicon for Firefox 2 bookmarks.
80 usage.urls.insert(link_url);
81
82 favicons->push_back(usage);
83 }
84
85 } // namespace
86
87 namespace bookmark_html_reader {
88
ImportBookmarksFile(const base::Callback<bool (void)> & cancellation_callback,const base::Callback<bool (const GURL &)> & valid_url_callback,const base::FilePath & file_path,std::vector<ImportedBookmarkEntry> * bookmarks,std::vector<ImportedFaviconUsage> * favicons)89 void ImportBookmarksFile(
90 const base::Callback<bool(void)>& cancellation_callback,
91 const base::Callback<bool(const GURL&)>& valid_url_callback,
92 const base::FilePath& file_path,
93 std::vector<ImportedBookmarkEntry>* bookmarks,
94 std::vector<ImportedFaviconUsage>* favicons) {
95 std::string content;
96 base::ReadFileToString(file_path, &content);
97 std::vector<std::string> lines;
98 base::SplitString(content, '\n', &lines);
99
100 base::string16 last_folder;
101 bool last_folder_on_toolbar = false;
102 bool last_folder_is_empty = true;
103 bool has_subfolder = false;
104 base::Time last_folder_add_date;
105 std::vector<base::string16> path;
106 size_t toolbar_folder_index = 0;
107 std::string charset;
108 for (size_t i = 0;
109 i < lines.size() &&
110 (cancellation_callback.is_null() || !cancellation_callback.Run());
111 ++i) {
112 std::string line;
113 base::TrimString(lines[i], " ", &line);
114
115 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
116 // separator in Firefox that Chrome does not support. Note that there can be
117 // multiple "<HR>" tags at the beginning of a single line.
118 // See http://crbug.com/257474.
119 static const char kHrTag[] = "<HR>";
120 while (StartsWithASCII(line, kHrTag, false)) {
121 line.erase(0, arraysize(kHrTag) - 1);
122 base::TrimString(line, " ", &line);
123 }
124
125 // Get the encoding of the bookmark file.
126 if (internal::ParseCharsetFromLine(line, &charset))
127 continue;
128
129 // Get the folder name.
130 if (internal::ParseFolderNameFromLine(line,
131 charset,
132 &last_folder,
133 &last_folder_on_toolbar,
134 &last_folder_add_date)) {
135 continue;
136 }
137
138 // Get the bookmark entry.
139 base::string16 title;
140 base::string16 shortcut;
141 GURL url, favicon;
142 base::Time add_date;
143 base::string16 post_data;
144 bool is_bookmark;
145 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
146 // keywords yet.
147 is_bookmark =
148 internal::ParseBookmarkFromLine(line, charset, &title,
149 &url, &favicon, &shortcut,
150 &add_date, &post_data) ||
151 internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);
152
153 if (is_bookmark)
154 last_folder_is_empty = false;
155
156 if (is_bookmark &&
157 post_data.empty() &&
158 (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
159 if (toolbar_folder_index > path.size() && !path.empty()) {
160 NOTREACHED(); // error in parsing.
161 break;
162 }
163
164 ImportedBookmarkEntry entry;
165 entry.creation_time = add_date;
166 entry.url = url;
167 entry.title = title;
168
169 if (toolbar_folder_index) {
170 // The toolbar folder should be at the top level.
171 entry.in_toolbar = true;
172 entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
173 } else {
174 // Add this bookmark to the list of |bookmarks|.
175 if (!has_subfolder && !last_folder.empty()) {
176 path.push_back(last_folder);
177 last_folder.clear();
178 }
179 entry.path.assign(path.begin(), path.end());
180 }
181 bookmarks->push_back(entry);
182
183 // Save the favicon. DataURLToFaviconUsage will handle the case where
184 // there is no favicon.
185 if (favicons)
186 DataURLToFaviconUsage(url, favicon, favicons);
187
188 continue;
189 }
190
191 // Bookmarks in sub-folder are encapsulated with <DL> tag.
192 if (StartsWithASCII(line, "<DL>", false)) {
193 has_subfolder = true;
194 if (!last_folder.empty()) {
195 path.push_back(last_folder);
196 last_folder.clear();
197 }
198 if (last_folder_on_toolbar && !toolbar_folder_index)
199 toolbar_folder_index = path.size();
200
201 // Mark next folder empty as initial state.
202 last_folder_is_empty = true;
203 } else if (StartsWithASCII(line, "</DL>", false)) {
204 if (path.empty())
205 break; // Mismatch <DL>.
206
207 base::string16 folder_title = path.back();
208 path.pop_back();
209
210 if (last_folder_is_empty) {
211 // Empty folder should be added explicitly.
212 ImportedBookmarkEntry entry;
213 entry.is_folder = true;
214 entry.creation_time = last_folder_add_date;
215 entry.title = folder_title;
216 if (toolbar_folder_index) {
217 // The toolbar folder should be at the top level.
218 // Make sure we don't add the toolbar folder itself if it is empty.
219 if (toolbar_folder_index <= path.size()) {
220 entry.in_toolbar = true;
221 entry.path.assign(path.begin() + toolbar_folder_index - 1,
222 path.end());
223 bookmarks->push_back(entry);
224 }
225 } else {
226 // Add this folder to the list of |bookmarks|.
227 entry.path.assign(path.begin(), path.end());
228 bookmarks->push_back(entry);
229 }
230
231 // Parent folder include current one, so it's not empty.
232 last_folder_is_empty = false;
233 }
234
235 if (toolbar_folder_index > path.size())
236 toolbar_folder_index = 0;
237 }
238 }
239 }
240
241 namespace internal {
242
ParseCharsetFromLine(const std::string & line,std::string * charset)243 bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
244 const char kCharset[] = "charset=";
245 if (StartsWithASCII(line, "<META", false) &&
246 (line.find("CONTENT=\"") != std::string::npos ||
247 line.find("content=\"") != std::string::npos)) {
248 size_t begin = line.find(kCharset);
249 if (begin == std::string::npos)
250 return false;
251 begin += std::string(kCharset).size();
252 size_t end = line.find_first_of('\"', begin);
253 *charset = line.substr(begin, end - begin);
254 return true;
255 }
256 return false;
257 }
258
ParseFolderNameFromLine(const std::string & line,const std::string & charset,base::string16 * folder_name,bool * is_toolbar_folder,base::Time * add_date)259 bool ParseFolderNameFromLine(const std::string& line,
260 const std::string& charset,
261 base::string16* folder_name,
262 bool* is_toolbar_folder,
263 base::Time* add_date) {
264 const char kFolderOpen[] = "<DT><H3";
265 const char kFolderClose[] = "</H3>";
266 const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
267 const char kAddDateAttribute[] = "ADD_DATE";
268
269 if (!StartsWithASCII(line, kFolderOpen, true))
270 return false;
271
272 size_t end = line.find(kFolderClose);
273 size_t tag_end = line.rfind('>', end) + 1;
274 // If no end tag or start tag is broken, we skip to find the folder name.
275 if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
276 return false;
277
278 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
279 base::OnStringConversionError::SKIP, folder_name);
280 *folder_name = net::UnescapeForHTML(*folder_name);
281
282 std::string attribute_list = line.substr(arraysize(kFolderOpen),
283 tag_end - arraysize(kFolderOpen) - 1);
284 std::string value;
285
286 // Add date
287 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
288 int64 time;
289 base::StringToInt64(value, &time);
290 // Upper bound it at 32 bits.
291 if (0 < time && time < (1LL << 32))
292 *add_date = base::Time::FromTimeT(time);
293 }
294
295 if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
296 LowerCaseEqualsASCII(value, "true"))
297 *is_toolbar_folder = true;
298 else
299 *is_toolbar_folder = false;
300
301 return true;
302 }
303
ParseBookmarkFromLine(const std::string & line,const std::string & charset,base::string16 * title,GURL * url,GURL * favicon,base::string16 * shortcut,base::Time * add_date,base::string16 * post_data)304 bool ParseBookmarkFromLine(const std::string& line,
305 const std::string& charset,
306 base::string16* title,
307 GURL* url,
308 GURL* favicon,
309 base::string16* shortcut,
310 base::Time* add_date,
311 base::string16* post_data) {
312 const char kItemOpen[] = "<DT><A";
313 const char kItemClose[] = "</A>";
314 const char kFeedURLAttribute[] = "FEEDURL";
315 const char kHrefAttribute[] = "HREF";
316 const char kIconAttribute[] = "ICON";
317 const char kShortcutURLAttribute[] = "SHORTCUTURL";
318 const char kAddDateAttribute[] = "ADD_DATE";
319 const char kPostDataAttribute[] = "POST_DATA";
320
321 title->clear();
322 *url = GURL();
323 *favicon = GURL();
324 shortcut->clear();
325 post_data->clear();
326 *add_date = base::Time();
327
328 if (!StartsWithASCII(line, kItemOpen, true))
329 return false;
330
331 size_t end = line.find(kItemClose);
332 size_t tag_end = line.rfind('>', end) + 1;
333 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
334 return false; // No end tag or start tag is broken.
335
336 std::string attribute_list = line.substr(arraysize(kItemOpen),
337 tag_end - arraysize(kItemOpen) - 1);
338
339 // We don't import Live Bookmark folders, which is Firefox's RSS reading
340 // feature, since the user never necessarily bookmarked them and we don't
341 // have this feature to update their contents.
342 std::string value;
343 if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
344 return false;
345
346 // Title
347 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
348 base::OnStringConversionError::SKIP, title);
349 *title = net::UnescapeForHTML(*title);
350
351 // URL
352 if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
353 base::string16 url16;
354 base::CodepageToUTF16(value, charset.c_str(),
355 base::OnStringConversionError::SKIP, &url16);
356 url16 = net::UnescapeForHTML(url16);
357
358 *url = GURL(url16);
359 }
360
361 // Favicon
362 if (GetAttribute(attribute_list, kIconAttribute, &value))
363 *favicon = GURL(value);
364
365 // Keyword
366 if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
367 base::CodepageToUTF16(value, charset.c_str(),
368 base::OnStringConversionError::SKIP, shortcut);
369 *shortcut = net::UnescapeForHTML(*shortcut);
370 }
371
372 // Add date
373 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
374 int64 time;
375 base::StringToInt64(value, &time);
376 // Upper bound it at 32 bits.
377 if (0 < time && time < (1LL << 32))
378 *add_date = base::Time::FromTimeT(time);
379 }
380
381 // Post data.
382 if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
383 base::CodepageToUTF16(value, charset.c_str(),
384 base::OnStringConversionError::SKIP, post_data);
385 *post_data = net::UnescapeForHTML(*post_data);
386 }
387
388 return true;
389 }
390
ParseMinimumBookmarkFromLine(const std::string & line,const std::string & charset,base::string16 * title,GURL * url)391 bool ParseMinimumBookmarkFromLine(const std::string& line,
392 const std::string& charset,
393 base::string16* title,
394 GURL* url) {
395 const char kItemOpen[] = "<DT><A";
396 const char kItemClose[] = "</";
397 const char kHrefAttributeUpper[] = "HREF";
398 const char kHrefAttributeLower[] = "href";
399
400 title->clear();
401 *url = GURL();
402
403 // Case-insensitive check of open tag.
404 if (!StartsWithASCII(line, kItemOpen, false))
405 return false;
406
407 // Find any close tag.
408 size_t end = line.find(kItemClose);
409 size_t tag_end = line.rfind('>', end) + 1;
410 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
411 return false; // No end tag or start tag is broken.
412
413 std::string attribute_list = line.substr(arraysize(kItemOpen),
414 tag_end - arraysize(kItemOpen) - 1);
415
416 // Title
417 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
418 base::OnStringConversionError::SKIP, title);
419 *title = net::UnescapeForHTML(*title);
420
421 // URL
422 std::string value;
423 if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
424 GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
425 if (charset.length() != 0) {
426 base::string16 url16;
427 base::CodepageToUTF16(value, charset.c_str(),
428 base::OnStringConversionError::SKIP, &url16);
429 url16 = net::UnescapeForHTML(url16);
430
431 *url = GURL(url16);
432 } else {
433 *url = GURL(value);
434 }
435 }
436
437 return true;
438 }
439
440 } // namespace internal
441
442 } // namespace bookmark_html_reader
443