• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4  *
5  * The contents of this file are subject to the Mozilla Public License Version
6  * 1.1 (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  * http://www.mozilla.org/MPL/
9  *
10  * Software distributed under the License is distributed on an "AS IS" basis,
11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12  * for the specific language governing rights and limitations under the
13  * License.
14  *
15  * The Original Code is the Mork Reader.
16  *
17  * The Initial Developer of the Original Code is
18  * Google Inc.
19  * Portions created by the Initial Developer are Copyright (C) 2006
20  * the Initial Developer. All Rights Reserved.
21  *
22  * Contributor(s):
23  *   Brian Ryner <bryner@brianryner.com> (original author)
24  *
25  * Alternatively, the contents of this file may be used under the terms of
26  * either the GNU General Public License Version 2 or later (the "GPL"), or
27  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28  * in which case the provisions of the GPL or the LGPL are applicable instead
29  * of those above. If you wish to allow use of your version of this file only
30  * under the terms of either the GPL or the LGPL, and not to allow others to
31  * use your version of this file under the terms of the MPL, indicate your
32  * decision by deleting the provisions above and replace them with the notice
33  * and other provisions required by the GPL or the LGPL. If you do not delete
34  * the provisions above, a recipient may use your version of this file under
35  * the terms of any one of the MPL, the GPL or the LGPL.
36  *
37  * ***** END LICENSE BLOCK ***** */
38 
39 // Source:
40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
41 // This file has been converted to google style.
42 
43 #include "chrome/browser/importer/mork_reader.h"
44 
45 #include <algorithm>
46 
47 #include "base/file_path.h"
48 #include "base/i18n/icu_string_conversions.h"
49 #include "base/logging.h"
50 #include "base/message_loop.h"
51 #include "base/string_number_conversions.h"
52 #include "base/string_util.h"
53 #include "chrome/browser/history/history_types.h"
54 #include "chrome/browser/importer/firefox_importer_utils.h"
55 #include "chrome/browser/importer/importer_bridge.h"
56 
57 namespace {
58 
// Converts an uppercase hexadecimal digit ('0'-'9', 'A'-'F') to its numeric
// value.  Returns -1 for any other character, lowercase digits included.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Two escape forms are handled:
//   - "\c" yields the character c unchanged (used for '$', '\' and ')').
//     A trailing backslash with nothing after it is dropped.
//   - "$XX" yields the byte whose value is the two hex digits XX.  If either
//     digit is invalid all three characters are dropped.  If fewer than two
//     characters follow the '$', only the '$' itself is dropped.
// Every other character is copied through unchanged.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal: skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign denotes a hex-encoded byte.  The bound is written as
      // |i + 2 < input_length| rather than |i < input_length - 2| because the
      // latter wraps around for inputs shorter than two characters and would
      // let the reads below run past the end of |input|.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}
104 
105 }  // namespace
106 
MorkReader()107 MorkReader::MorkReader() {
108 }
109 
~MorkReader()110 MorkReader::~MorkReader() {
111   // Need to delete all the pointers to vectors we have in the table.
112   for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
113     delete i->second;
114 }
115 
Read(const FilePath & path)116 bool MorkReader::Read(const FilePath& path) {
117   stream_.open(path.value().c_str());
118   if (!stream_.is_open())
119     return false;
120 
121   std::string line;
122   if (!ReadLine(&line) ||
123       line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
124     return false;  // Unexpected file format.
125 
126   IndexMap column_map;
127   while (ReadLine(&line)) {
128     // Trim off leading spaces
129     size_t idx = 0;
130     size_t len = line.size();
131     while (idx < len && line[idx] == ' ')
132       ++idx;
133     if (idx >= len)
134       continue;
135 
136     // Look at the line to figure out what section type this is
137     if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
138       // Column map.  We begin by creating a hash of column id to column name.
139       StringMap column_name_map;
140       ParseMap(line, idx, &column_name_map);
141 
142       // Now that we have the list of columns, we put them into a flat array.
143       // Rows will have value arrays of the same size, with indexes that
144       // correspond to the columns array.  As we insert each column into the
145       // array, we also make an entry in columnMap so that we can look up the
146       // index given the column id.
147       columns_.reserve(column_name_map.size());
148 
149       for (StringMap::const_iterator i = column_name_map.begin();
150            i != column_name_map.end(); ++i) {
151         column_map[i->first] = static_cast<int>(columns_.size());
152         MorkColumn col(i->first, i->second);
153         columns_.push_back(col);
154       }
155     } else if (StartsWithASCII(&line[idx], "<(", true)) {
156       // Value map.
157       ParseMap(line, idx, &value_map_);
158     } else if (line[idx] == '{' || line[idx] == '[') {
159       // Table / table row.
160       ParseTable(line, idx, &column_map);
161     } else {
162       // Don't know, hopefully don't care.
163     }
164   }
165   return true;
166 }
167 
168 // Parses a key/value map of the form
169 // <(k1=v1)(k2=v2)...>
ParseMap(const std::string & first_line,size_t start_index,StringMap * map)170 bool MorkReader::ParseMap(const std::string& first_line,
171                           size_t start_index,
172                           StringMap* map) {
173   // If the first line is the a=c line (column map), just skip over it.
174   std::string line(first_line);
175   if (StartsWithASCII(line, "< <(a=c)>", true))
176     ReadLine(&line);
177 
178   std::string key;
179   do {
180     size_t idx = start_index;
181     size_t len = line.size();
182     size_t token_start;
183 
184     while (idx < len) {
185       switch (line[idx++]) {
186         case '(':
187           // Beginning of a key/value pair.
188           if (!key.empty()) {
189             DLOG(WARNING) << "unterminated key/value pair?";
190             key.clear();
191           }
192 
193           token_start = idx;
194           while (idx < len && line[idx] != '=')
195             ++idx;
196           key.assign(&line[token_start], idx - token_start);
197           break;
198 
199         case '=': {
200           // Beginning of the value.
201           if (key.empty()) {
202             DLOG(WARNING) << "stray value";
203             break;
204           }
205 
206           token_start = idx;
207           while (idx < len && line[idx] != ')') {
208             if (line[idx] == '\\')
209               ++idx;  // Skip escaped ')' characters.
210             ++idx;
211           }
212           size_t token_end = std::min(idx, len);
213           ++idx;
214 
215           std::string value = MorkUnescape(
216               std::string(&line[token_start], token_end - token_start));
217           (*map)[key] = value;
218           key.clear();
219           break;
220         }
221         case '>':
222           // End of the map.
223           DLOG_IF(WARNING, key.empty()) <<
224               "map terminates inside of key/value pair";
225           return true;
226       }
227     }
228 
229     // We should start reading the next line at the beginning.
230     start_index = 0;
231   } while (ReadLine(&line));
232 
233   // We ran out of lines and the map never terminated.  This probably indicates
234   // a parsing error.
235   DLOG(WARNING) << "didn't find end of key/value map";
236   return false;
237 }
238 
239 // Parses a table row of the form [123(^45^67)..]
240 // (row id 123 has the value with id 67 for the column with id 45).
241 // A '^' prefix for a column or value references an entry in the column or
242 // value map.  '=' is used as the separator when the value is a literal.
ParseTable(const std::string & first_line,size_t start_index,const IndexMap * column_map)243 void MorkReader::ParseTable(const std::string& first_line,
244                             size_t start_index,
245                             const IndexMap* column_map) {
246   std::string line(first_line);
247 
248   // Column index of the cell we're parsing, minus one if invalid.
249   int column_index = -1;
250 
251   // Points to the current row we're parsing inside of the |table_|, will be
252   // NULL if we're not inside a row.
253   ColumnDataList* current_row = NULL;
254 
255   bool in_meta_row = false;
256 
257   do {
258     size_t idx = start_index;
259     size_t len = line.size();
260 
261     while (idx < len) {
262       switch (line[idx++]) {
263         case '{':
264           // This marks the beginning of a table section.  There's a lot of
265           // junk before the first row that looks like cell values but isn't.
266           // Skip to the first '['.
267           while (idx < len && line[idx] != '[') {
268             if (line[idx] == '{') {
269               in_meta_row = true;  // The meta row is enclosed in { }
270             } else if (line[idx] == '}') {
271               in_meta_row = false;
272             }
273             ++idx;
274           }
275           break;
276 
277         case '[': {
278           // Start of a new row.  Consume the row id, up to the first '('.
279           // Row edits also have a table namespace, separated from the row id
280           // by a colon.  We don't make use of the namespace, but we need to
281           // make sure not to consider it part of the row id.
282           if (current_row) {
283             DLOG(WARNING) << "unterminated row?";
284             current_row = NULL;
285           }
286 
287           // Check for a '-' at the start of the id.  This signifies that
288           // if the row already exists, we should delete all columns from it
289           // before adding the new values.
290           bool cut_columns;
291           if (idx < len && line[idx] == '-') {
292             cut_columns = true;
293             ++idx;
294           } else {
295             cut_columns = false;
296           }
297 
298           // Locate the range of the ID.
299           size_t token_start = idx;  // Index of the first char of the token.
300           while (idx < len &&
301                  line[idx] != '(' &&
302                  line[idx] != ']' &&
303                  line[idx] != ':') {
304             ++idx;
305           }
306           size_t token_end = idx;  // Index of the char following the token.
307           while (idx < len && line[idx] != '(' && line[idx] != ']') {
308             ++idx;
309           }
310 
311           if (in_meta_row) {
312             // Need to create the meta row.
313             meta_row_.resize(columns_.size());
314             current_row = &meta_row_;
315           } else {
316             // Find or create the regular row for this.
317             IDString row_id(&line[token_start], token_end - token_start);
318             RowMap::iterator found_row = table_.find(row_id);
319             if (found_row == table_.end()) {
320               // We don't already have this row, create a new one for it.
321               current_row = new ColumnDataList(columns_.size());
322               table_[row_id] = current_row;
323             } else {
324               // The row already exists and we're adding/replacing things.
325               current_row = found_row->second;
326             }
327           }
328           if (cut_columns) {
329             for (size_t i = 0; i < current_row->size(); ++i)
330               (*current_row)[i].clear();
331           }
332           break;
333         }
334 
335         case ']':
336           // We're done with the row.
337           current_row = NULL;
338           in_meta_row = false;
339           break;
340 
341         case '(': {
342           if (!current_row) {
343             DLOG(WARNING) << "cell value outside of row";
344             break;
345           }
346 
347           bool column_is_atom;
348           if (line[idx] == '^') {
349             column_is_atom = true;
350             ++idx;  // This is not part of the column id, advance past it.
351           } else {
352             column_is_atom = false;
353           }
354           size_t token_start = idx;
355           while (idx < len && line[idx] != '^' && line[idx] != '=') {
356             if (line[idx] == '\\')
357               ++idx;  // Skip escaped characters.
358             ++idx;
359           }
360 
361           size_t token_end = std::min(idx, len);
362 
363           IDString column;
364           if (column_is_atom)
365             column.assign(&line[token_start], token_end - token_start);
366           else
367             column = MorkUnescape(line.substr(token_start,
368                                               token_end - token_start));
369 
370           IndexMap::const_iterator found_column = column_map->find(column);
371           if (found_column == column_map->end()) {
372             DLOG(WARNING) << "Column not in column map, discarding it";
373             column_index = -1;
374           } else {
375             column_index = found_column->second;
376           }
377           break;
378         }
379 
380         case '=':
381         case '^': {
382           if (column_index == -1) {
383             DLOG(WARNING) << "stray ^ or = marker";
384             break;
385           }
386 
387           bool value_is_atom = (line[idx - 1] == '^');
388           size_t token_start = idx - 1;  // Include the '=' or '^' marker.
389           while (idx < len && line[idx] != ')') {
390             if (line[idx] == '\\')
391               ++idx;  // Skip escaped characters.
392             ++idx;
393           }
394           size_t token_end = std::min(idx, len);
395           ++idx;
396 
397           if (value_is_atom) {
398             (*current_row)[column_index].assign(&line[token_start],
399                                                 token_end - token_start);
400           } else {
401             (*current_row)[column_index] =
402                 MorkUnescape(line.substr(token_start, token_end - token_start));
403           }
404           column_index = -1;
405         }
406         break;
407       }
408     }
409 
410     // Start parsing the next line at the beginning.
411     start_index = 0;
412   } while (current_row && ReadLine(&line));
413 }
414 
ReadLine(std::string * line)415 bool MorkReader::ReadLine(std::string* line) {
416   line->resize(256);
417   std::getline(stream_, *line);
418   if (stream_.eof() || stream_.bad())
419     return false;
420 
421   while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
422     // There is a continuation for this line.  Read it and append.
423     std::string new_line;
424     std::getline(stream_, new_line);
425     if (stream_.eof())
426       return false;
427     line->erase(line->size() - 1);
428     line->append(new_line);
429   }
430 
431   return true;
432 }
433 
NormalizeValue(std::string * value) const434 void MorkReader::NormalizeValue(std::string* value) const {
435   if (value->empty())
436     return;
437   MorkReader::StringMap::const_iterator i;
438   switch (value->at(0)) {
439     case '^':
440       // Hex ID, lookup the name for it in the |value_map_|.
441       i = value_map_.find(value->substr(1));
442       if (i == value_map_.end())
443         value->clear();
444       else
445         *value = i->second;
446       break;
447     case '=':
448       // Just use the literal after the equals sign.
449       value->erase(value->begin());
450       break;
451     default:
452       // Anything else is invalid.
453       value->clear();
454       break;
455   }
456 }
457 
458 // Source:
459 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
460 
// The fields we read from entry (non-meta) history rows.  Each enumerator is
// also the index of that field's name in |gColumnNames|.
enum {
  kURLColumn,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Keep me last.
};

// Column names as they appear in the file, in the same order as the enum
// above.
static const char* const gColumnNames[] = {
  "URL",
  "Name",
  "VisitCount",
  "Hidden",
  "Typed",
  "LastVisitDate"
};
475 
476 struct TableReadClosure {
TableReadClosureTableReadClosure477   explicit TableReadClosure(const MorkReader& r)
478       : reader(r),
479         swap_bytes(false),
480         byte_order_column(-1) {
481     for (int i = 0; i < kColumnCount; ++i)
482       column_indexes[i] = -1;
483   }
484 
485   // Backpointers to the reader and history we're operating on.
486   const MorkReader& reader;
487 
488   // Whether we need to swap bytes (file format is other-endian).
489   bool swap_bytes;
490 
491   // Indexes of the columns that we care about.
492   int column_indexes[kColumnCount];
493   int byte_order_column;
494 };
495 
AddToHistory(MorkReader::ColumnDataList * column_values,const TableReadClosure & data,std::vector<history::URLRow> * rows)496 void AddToHistory(MorkReader::ColumnDataList* column_values,
497                   const TableReadClosure& data,
498                   std::vector<history::URLRow>* rows) {
499   std::string values[kColumnCount];
500 
501   for (size_t i = 0; i < kColumnCount; ++i) {
502     if (data.column_indexes[i] != -1) {
503       values[i] = column_values->at(data.column_indexes[i]);
504       data.reader.NormalizeValue(&values[i]);
505       // Do not import hidden records.
506       if (i == kHiddenColumn && values[i] == "1")
507         return;
508     }
509   }
510 
511   GURL url(values[kURLColumn]);
512 
513   if (CanImportURL(url)) {
514     history::URLRow row(url);
515 
516     string16 title;
517     if (data.swap_bytes) {
518       base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
519                             base::OnStringConversionError::SKIP, &title);
520     } else {
521       base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
522                             base::OnStringConversionError::SKIP, &title);
523     }
524     row.set_title(title);
525 
526     int count = atoi(values[kVisitCountColumn].c_str());
527     if (count == 0)
528       count = 1;
529     row.set_visit_count(count);
530 
531     int64 date;
532     base::StringToInt64(values[kLastVisitColumn], &date);
533     if (date != 0)
534       row.set_last_visit(base::Time::FromTimeT(date / 1000000));
535 
536     bool is_typed = (values[kTypedColumn] == "1");
537     if (is_typed)
538       row.set_typed_count(1);
539 
540     rows->push_back(row);
541   }
542 }
543 
544 // It sets up the file stream and loops over the lines in the file to
545 // parse them, then adds the resulting row set to history.
ImportHistoryFromFirefox2(const FilePath & file,ImporterBridge * bridge)546 void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
547   MorkReader reader;
548   reader.Read(file);
549 
550   // Gather up the column ids so we don't need to find them on each row
551   TableReadClosure data(reader);
552   const MorkReader::MorkColumnList& columns = reader.columns();
553   for (size_t i = 0; i < columns.size(); ++i) {
554     for (int j = 0; j < kColumnCount; ++j)
555       if (columns[i].name == gColumnNames[j]) {
556         data.column_indexes[j] = static_cast<int>(i);
557         break;
558       }
559     if (columns[i].name == "ByteOrder")
560       data.byte_order_column = static_cast<int>(i);
561   }
562 
563   // Determine the byte order from the table's meta-row.
564   const MorkReader::ColumnDataList& meta_row = reader.meta_row();
565   if (!meta_row.empty() && data.byte_order_column != -1) {
566     std::string byte_order = meta_row[data.byte_order_column];
567     if (!byte_order.empty()) {
568       // Note whether the file uses a non-native byte ordering.
569       // If it does, we'll have to swap bytes for PRUnichar values.
570       // "BE" and "LE" are the only recognized values, anything
571       // else is garbage and the file will be treated as native-endian
572       // (no swapping).
573       std::string byte_order_value(byte_order);
574       reader.NormalizeValue(&byte_order_value);
575       data.swap_bytes = (byte_order_value == "BE");
576     }
577   }
578 
579   std::vector<history::URLRow> rows;
580   for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
581     AddToHistory(i->second, data, &rows);
582   if (!rows.empty())
583     bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
584 }
585