1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is the Mork Reader.
16 *
17 * The Initial Developer of the Original Code is
18 * Google Inc.
19 * Portions created by the Initial Developer are Copyright (C) 2006
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 * Brian Ryner <bryner@brianryner.com> (original author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39 // Source:
40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
41 // This file has been converted to google style.
42
43 #include "chrome/browser/importer/mork_reader.h"
44
45 #include <algorithm>
46
47 #include "base/file_path.h"
48 #include "base/i18n/icu_string_conversions.h"
49 #include "base/logging.h"
50 #include "base/message_loop.h"
51 #include "base/string_number_conversions.h"
52 #include "base/string_util.h"
53 #include "chrome/browser/history/history_types.h"
54 #include "chrome/browser/importer/firefox_importer_utils.h"
55 #include "chrome/browser/importer/importer_bridge.h"
56
namespace {

// Converts a single hex digit ('0'-'9' or uppercase 'A'-'F') to its numeric
// value.  Returns -1 for any other character, including lowercase 'a'-'f'.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Mork uses $xx (two hex digits) escaping to encode non-ASCII bytes, and a
// backslash to escape the character that follows it (e.g. '$' and '\').
// Malformed input is handled leniently:
//   - a trailing lone backslash is dropped;
//   - a '$' that is not followed by at least two more characters is dropped
//     and the remaining characters are processed normally;
//   - a '$' followed by two characters that are not both valid hex digits
//     consumes all three characters and emits nothing.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal: skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign introduces a two-digit hex escape.  Test |i + 2| against
      // the length instead of |i| against |input_length - 2|: the latter
      // wraps around for inputs shorter than two characters, which let the
      // reads below run past the end of the string.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
106
MorkReader()107 MorkReader::MorkReader() {
108 }
109
~MorkReader()110 MorkReader::~MorkReader() {
111 // Need to delete all the pointers to vectors we have in the table.
112 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
113 delete i->second;
114 }
115
Read(const FilePath & path)116 bool MorkReader::Read(const FilePath& path) {
117 stream_.open(path.value().c_str());
118 if (!stream_.is_open())
119 return false;
120
121 std::string line;
122 if (!ReadLine(&line) ||
123 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
124 return false; // Unexpected file format.
125
126 IndexMap column_map;
127 while (ReadLine(&line)) {
128 // Trim off leading spaces
129 size_t idx = 0;
130 size_t len = line.size();
131 while (idx < len && line[idx] == ' ')
132 ++idx;
133 if (idx >= len)
134 continue;
135
136 // Look at the line to figure out what section type this is
137 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
138 // Column map. We begin by creating a hash of column id to column name.
139 StringMap column_name_map;
140 ParseMap(line, idx, &column_name_map);
141
142 // Now that we have the list of columns, we put them into a flat array.
143 // Rows will have value arrays of the same size, with indexes that
144 // correspond to the columns array. As we insert each column into the
145 // array, we also make an entry in columnMap so that we can look up the
146 // index given the column id.
147 columns_.reserve(column_name_map.size());
148
149 for (StringMap::const_iterator i = column_name_map.begin();
150 i != column_name_map.end(); ++i) {
151 column_map[i->first] = static_cast<int>(columns_.size());
152 MorkColumn col(i->first, i->second);
153 columns_.push_back(col);
154 }
155 } else if (StartsWithASCII(&line[idx], "<(", true)) {
156 // Value map.
157 ParseMap(line, idx, &value_map_);
158 } else if (line[idx] == '{' || line[idx] == '[') {
159 // Table / table row.
160 ParseTable(line, idx, &column_map);
161 } else {
162 // Don't know, hopefully don't care.
163 }
164 }
165 return true;
166 }
167
168 // Parses a key/value map of the form
169 // <(k1=v1)(k2=v2)...>
ParseMap(const std::string & first_line,size_t start_index,StringMap * map)170 bool MorkReader::ParseMap(const std::string& first_line,
171 size_t start_index,
172 StringMap* map) {
173 // If the first line is the a=c line (column map), just skip over it.
174 std::string line(first_line);
175 if (StartsWithASCII(line, "< <(a=c)>", true))
176 ReadLine(&line);
177
178 std::string key;
179 do {
180 size_t idx = start_index;
181 size_t len = line.size();
182 size_t token_start;
183
184 while (idx < len) {
185 switch (line[idx++]) {
186 case '(':
187 // Beginning of a key/value pair.
188 if (!key.empty()) {
189 DLOG(WARNING) << "unterminated key/value pair?";
190 key.clear();
191 }
192
193 token_start = idx;
194 while (idx < len && line[idx] != '=')
195 ++idx;
196 key.assign(&line[token_start], idx - token_start);
197 break;
198
199 case '=': {
200 // Beginning of the value.
201 if (key.empty()) {
202 DLOG(WARNING) << "stray value";
203 break;
204 }
205
206 token_start = idx;
207 while (idx < len && line[idx] != ')') {
208 if (line[idx] == '\\')
209 ++idx; // Skip escaped ')' characters.
210 ++idx;
211 }
212 size_t token_end = std::min(idx, len);
213 ++idx;
214
215 std::string value = MorkUnescape(
216 std::string(&line[token_start], token_end - token_start));
217 (*map)[key] = value;
218 key.clear();
219 break;
220 }
221 case '>':
222 // End of the map.
223 DLOG_IF(WARNING, key.empty()) <<
224 "map terminates inside of key/value pair";
225 return true;
226 }
227 }
228
229 // We should start reading the next line at the beginning.
230 start_index = 0;
231 } while (ReadLine(&line));
232
233 // We ran out of lines and the map never terminated. This probably indicates
234 // a parsing error.
235 DLOG(WARNING) << "didn't find end of key/value map";
236 return false;
237 }
238
239 // Parses a table row of the form [123(^45^67)..]
240 // (row id 123 has the value with id 67 for the column with id 45).
241 // A '^' prefix for a column or value references an entry in the column or
242 // value map. '=' is used as the separator when the value is a literal.
ParseTable(const std::string & first_line,size_t start_index,const IndexMap * column_map)243 void MorkReader::ParseTable(const std::string& first_line,
244 size_t start_index,
245 const IndexMap* column_map) {
246 std::string line(first_line);
247
248 // Column index of the cell we're parsing, minus one if invalid.
249 int column_index = -1;
250
251 // Points to the current row we're parsing inside of the |table_|, will be
252 // NULL if we're not inside a row.
253 ColumnDataList* current_row = NULL;
254
255 bool in_meta_row = false;
256
257 do {
258 size_t idx = start_index;
259 size_t len = line.size();
260
261 while (idx < len) {
262 switch (line[idx++]) {
263 case '{':
264 // This marks the beginning of a table section. There's a lot of
265 // junk before the first row that looks like cell values but isn't.
266 // Skip to the first '['.
267 while (idx < len && line[idx] != '[') {
268 if (line[idx] == '{') {
269 in_meta_row = true; // The meta row is enclosed in { }
270 } else if (line[idx] == '}') {
271 in_meta_row = false;
272 }
273 ++idx;
274 }
275 break;
276
277 case '[': {
278 // Start of a new row. Consume the row id, up to the first '('.
279 // Row edits also have a table namespace, separated from the row id
280 // by a colon. We don't make use of the namespace, but we need to
281 // make sure not to consider it part of the row id.
282 if (current_row) {
283 DLOG(WARNING) << "unterminated row?";
284 current_row = NULL;
285 }
286
287 // Check for a '-' at the start of the id. This signifies that
288 // if the row already exists, we should delete all columns from it
289 // before adding the new values.
290 bool cut_columns;
291 if (idx < len && line[idx] == '-') {
292 cut_columns = true;
293 ++idx;
294 } else {
295 cut_columns = false;
296 }
297
298 // Locate the range of the ID.
299 size_t token_start = idx; // Index of the first char of the token.
300 while (idx < len &&
301 line[idx] != '(' &&
302 line[idx] != ']' &&
303 line[idx] != ':') {
304 ++idx;
305 }
306 size_t token_end = idx; // Index of the char following the token.
307 while (idx < len && line[idx] != '(' && line[idx] != ']') {
308 ++idx;
309 }
310
311 if (in_meta_row) {
312 // Need to create the meta row.
313 meta_row_.resize(columns_.size());
314 current_row = &meta_row_;
315 } else {
316 // Find or create the regular row for this.
317 IDString row_id(&line[token_start], token_end - token_start);
318 RowMap::iterator found_row = table_.find(row_id);
319 if (found_row == table_.end()) {
320 // We don't already have this row, create a new one for it.
321 current_row = new ColumnDataList(columns_.size());
322 table_[row_id] = current_row;
323 } else {
324 // The row already exists and we're adding/replacing things.
325 current_row = found_row->second;
326 }
327 }
328 if (cut_columns) {
329 for (size_t i = 0; i < current_row->size(); ++i)
330 (*current_row)[i].clear();
331 }
332 break;
333 }
334
335 case ']':
336 // We're done with the row.
337 current_row = NULL;
338 in_meta_row = false;
339 break;
340
341 case '(': {
342 if (!current_row) {
343 DLOG(WARNING) << "cell value outside of row";
344 break;
345 }
346
347 bool column_is_atom;
348 if (line[idx] == '^') {
349 column_is_atom = true;
350 ++idx; // This is not part of the column id, advance past it.
351 } else {
352 column_is_atom = false;
353 }
354 size_t token_start = idx;
355 while (idx < len && line[idx] != '^' && line[idx] != '=') {
356 if (line[idx] == '\\')
357 ++idx; // Skip escaped characters.
358 ++idx;
359 }
360
361 size_t token_end = std::min(idx, len);
362
363 IDString column;
364 if (column_is_atom)
365 column.assign(&line[token_start], token_end - token_start);
366 else
367 column = MorkUnescape(line.substr(token_start,
368 token_end - token_start));
369
370 IndexMap::const_iterator found_column = column_map->find(column);
371 if (found_column == column_map->end()) {
372 DLOG(WARNING) << "Column not in column map, discarding it";
373 column_index = -1;
374 } else {
375 column_index = found_column->second;
376 }
377 break;
378 }
379
380 case '=':
381 case '^': {
382 if (column_index == -1) {
383 DLOG(WARNING) << "stray ^ or = marker";
384 break;
385 }
386
387 bool value_is_atom = (line[idx - 1] == '^');
388 size_t token_start = idx - 1; // Include the '=' or '^' marker.
389 while (idx < len && line[idx] != ')') {
390 if (line[idx] == '\\')
391 ++idx; // Skip escaped characters.
392 ++idx;
393 }
394 size_t token_end = std::min(idx, len);
395 ++idx;
396
397 if (value_is_atom) {
398 (*current_row)[column_index].assign(&line[token_start],
399 token_end - token_start);
400 } else {
401 (*current_row)[column_index] =
402 MorkUnescape(line.substr(token_start, token_end - token_start));
403 }
404 column_index = -1;
405 }
406 break;
407 }
408 }
409
410 // Start parsing the next line at the beginning.
411 start_index = 0;
412 } while (current_row && ReadLine(&line));
413 }
414
ReadLine(std::string * line)415 bool MorkReader::ReadLine(std::string* line) {
416 line->resize(256);
417 std::getline(stream_, *line);
418 if (stream_.eof() || stream_.bad())
419 return false;
420
421 while (!line->empty() && (*line)[line->size() - 1] == '\\') {
422 // There is a continuation for this line. Read it and append.
423 std::string new_line;
424 std::getline(stream_, new_line);
425 if (stream_.eof())
426 return false;
427 line->erase(line->size() - 1);
428 line->append(new_line);
429 }
430
431 return true;
432 }
433
NormalizeValue(std::string * value) const434 void MorkReader::NormalizeValue(std::string* value) const {
435 if (value->empty())
436 return;
437 MorkReader::StringMap::const_iterator i;
438 switch (value->at(0)) {
439 case '^':
440 // Hex ID, lookup the name for it in the |value_map_|.
441 i = value_map_.find(value->substr(1));
442 if (i == value_map_.end())
443 value->clear();
444 else
445 *value = i->second;
446 break;
447 case '=':
448 // Just use the literal after the equals sign.
449 value->erase(value->begin());
450 break;
451 default:
452 // Anything else is invalid.
453 value->clear();
454 break;
455 }
456 }
457
458 // Source:
459 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
460
// Columns for entry (non-meta) history rows.  The enumerators are used as
// indexes into |gColumnNames|, so the two lists must stay in the same order.
enum {
  kURLColumn = 0,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Number of columns above.  Keep me last.
};

// Name of each column listed in the enum above, in enum order.
static const char* const gColumnNames[kColumnCount] = {
  "URL",
  "Name",
  "VisitCount",
  "Hidden",
  "Typed",
  "LastVisitDate"
};
475
476 struct TableReadClosure {
TableReadClosureTableReadClosure477 explicit TableReadClosure(const MorkReader& r)
478 : reader(r),
479 swap_bytes(false),
480 byte_order_column(-1) {
481 for (int i = 0; i < kColumnCount; ++i)
482 column_indexes[i] = -1;
483 }
484
485 // Backpointers to the reader and history we're operating on.
486 const MorkReader& reader;
487
488 // Whether we need to swap bytes (file format is other-endian).
489 bool swap_bytes;
490
491 // Indexes of the columns that we care about.
492 int column_indexes[kColumnCount];
493 int byte_order_column;
494 };
495
AddToHistory(MorkReader::ColumnDataList * column_values,const TableReadClosure & data,std::vector<history::URLRow> * rows)496 void AddToHistory(MorkReader::ColumnDataList* column_values,
497 const TableReadClosure& data,
498 std::vector<history::URLRow>* rows) {
499 std::string values[kColumnCount];
500
501 for (size_t i = 0; i < kColumnCount; ++i) {
502 if (data.column_indexes[i] != -1) {
503 values[i] = column_values->at(data.column_indexes[i]);
504 data.reader.NormalizeValue(&values[i]);
505 // Do not import hidden records.
506 if (i == kHiddenColumn && values[i] == "1")
507 return;
508 }
509 }
510
511 GURL url(values[kURLColumn]);
512
513 if (CanImportURL(url)) {
514 history::URLRow row(url);
515
516 string16 title;
517 if (data.swap_bytes) {
518 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
519 base::OnStringConversionError::SKIP, &title);
520 } else {
521 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
522 base::OnStringConversionError::SKIP, &title);
523 }
524 row.set_title(title);
525
526 int count = atoi(values[kVisitCountColumn].c_str());
527 if (count == 0)
528 count = 1;
529 row.set_visit_count(count);
530
531 int64 date;
532 base::StringToInt64(values[kLastVisitColumn], &date);
533 if (date != 0)
534 row.set_last_visit(base::Time::FromTimeT(date / 1000000));
535
536 bool is_typed = (values[kTypedColumn] == "1");
537 if (is_typed)
538 row.set_typed_count(1);
539
540 rows->push_back(row);
541 }
542 }
543
544 // It sets up the file stream and loops over the lines in the file to
545 // parse them, then adds the resulting row set to history.
ImportHistoryFromFirefox2(const FilePath & file,ImporterBridge * bridge)546 void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
547 MorkReader reader;
548 reader.Read(file);
549
550 // Gather up the column ids so we don't need to find them on each row
551 TableReadClosure data(reader);
552 const MorkReader::MorkColumnList& columns = reader.columns();
553 for (size_t i = 0; i < columns.size(); ++i) {
554 for (int j = 0; j < kColumnCount; ++j)
555 if (columns[i].name == gColumnNames[j]) {
556 data.column_indexes[j] = static_cast<int>(i);
557 break;
558 }
559 if (columns[i].name == "ByteOrder")
560 data.byte_order_column = static_cast<int>(i);
561 }
562
563 // Determine the byte order from the table's meta-row.
564 const MorkReader::ColumnDataList& meta_row = reader.meta_row();
565 if (!meta_row.empty() && data.byte_order_column != -1) {
566 std::string byte_order = meta_row[data.byte_order_column];
567 if (!byte_order.empty()) {
568 // Note whether the file uses a non-native byte ordering.
569 // If it does, we'll have to swap bytes for PRUnichar values.
570 // "BE" and "LE" are the only recognized values, anything
571 // else is garbage and the file will be treated as native-endian
572 // (no swapping).
573 std::string byte_order_value(byte_order);
574 reader.NormalizeValue(&byte_order_value);
575 data.swap_bytes = (byte_order_value == "BE");
576 }
577 }
578
579 std::vector<history::URLRow> rows;
580 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
581 AddToHistory(i->second, data, &rows);
582 if (!rows.empty())
583 bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
584 }
585