1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <limits>
6 #include <set>
7 #include <string>
8
9 #include "chrome/browser/history/text_database.h"
10
11 #include "app/sql/statement.h"
12 #include "app/sql/transaction.h"
13 #include "base/file_util.h"
14 #include "base/logging.h"
15 #include "base/metrics/histogram.h"
16 #include "base/string_number_conversions.h"
17 #include "base/string_util.h"
18 #include "base/utf_string_conversions.h"
19 #include "chrome/browser/diagnostics/sqlite_diagnostics.h"
20
21 // There are two tables in each database, one full-text search (FTS) table which
22 // indexes the contents and title of the pages. The other is a regular SQLITE
23 // table which contains non-indexed information about the page. All columns of
24 // a FTS table are indexed using the text search algorithm, which isn't what we
25 // want for things like times. If this were in the FTS table, there would be
26 // different words in the index for each time number.
27 //
28 // "pages" FTS table:
29 // url URL of the page so searches will match the URL.
30 // title Title of the page.
31 // body Body of the page.
32 //
33 // "info" regular table:
34 // time Time the corresponding FTS entry was visited.
35 //
36 // We do joins across these two tables by using their internal rowids, which we
37 // keep in sync between the two tables. The internal rowid is the only part of
38 // an FTS table that is indexed like a normal table, and the index over it is
39 // free since sqlite always indexes the internal rowid.
40
41 namespace history {
42
43 namespace {
44
45 // Version 1 uses FTS2 for index files.
46 // Version 2 uses FTS3.
47 static const int kCurrentVersionNumber = 2;
48 static const int kCompatibleVersionNumber = 2;
49
50 // Snippet computation relies on the index of the columns in the original
51 // create statement. These are the 0-based indices (as strings) of the
52 // corresponding columns.
53 const char kTitleColumnIndex[] = "1";
54 const char kBodyColumnIndex[] = "2";
55
56 // The string prepended to the database identifier to generate the filename.
57 const FilePath::CharType kFilePrefix[] = FILE_PATH_LITERAL("History Index ");
58
59 } // namespace
60
Match()61 TextDatabase::Match::Match() {}
62
~Match()63 TextDatabase::Match::~Match() {}
64
TextDatabase(const FilePath & path,DBIdent id,bool allow_create)65 TextDatabase::TextDatabase(const FilePath& path,
66 DBIdent id,
67 bool allow_create)
68 : path_(path),
69 ident_(id),
70 allow_create_(allow_create) {
71 // Compute the file name.
72 file_name_ = path_.Append(IDToFileName(ident_));
73 }
74
~TextDatabase()75 TextDatabase::~TextDatabase() {
76 }
77
78 // static
file_base()79 const FilePath::CharType* TextDatabase::file_base() {
80 return kFilePrefix;
81 }
82
83 // static
IDToFileName(DBIdent id)84 FilePath TextDatabase::IDToFileName(DBIdent id) {
85 // Identifiers are intended to be a combination of the year and month, for
86 // example, 200801 for January 2008. We convert this to
87 // "History Index 2008-01". However, we don't make assumptions about this
88 // scheme: the caller should assign IDs as it feels fit with the knowledge
89 // that they will apppear on disk in this form.
90 FilePath::StringType filename(file_base());
91 base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"),
92 id / 100, id % 100);
93 return FilePath(filename);
94 }
95
96 // static
FileNameToID(const FilePath & file_path)97 TextDatabase::DBIdent TextDatabase::FileNameToID(const FilePath& file_path) {
98 FilePath::StringType file_name = file_path.BaseName().value();
99
100 // We don't actually check the prefix here. Since the file system could
101 // be case insensitive in ways we can't predict (NTFS), checking could
102 // potentially be the wrong thing to do. Instead, we just look for a suffix.
103 static const size_t kIDStringLength = 7; // Room for "xxxx-xx".
104 if (file_name.length() < kIDStringLength)
105 return 0;
106 const FilePath::StringType suffix(
107 &file_name[file_name.length() - kIDStringLength]);
108
109 if (suffix.length() != kIDStringLength ||
110 suffix[4] != FILE_PATH_LITERAL('-')) {
111 return 0;
112 }
113
114 int year, month;
115 base::StringToInt(suffix.begin(), suffix.begin() + 4, &year);
116 base::StringToInt(suffix.begin() + 5, suffix.begin() + 7, &month);
117
118 return year * 100 + month;
119 }
120
Init()121 bool TextDatabase::Init() {
122 // Make sure, if we're not allowed to create the file, that it exists.
123 if (!allow_create_) {
124 if (!file_util::PathExists(file_name_))
125 return false;
126 }
127
128 // Set the exceptional sqlite error handler.
129 db_.set_error_delegate(GetErrorHandlerForTextDb());
130
131 // Set the database page size to something a little larger to give us
132 // better performance (we're typically seek rather than bandwidth limited).
133 // This only has an effect before any tables have been created, otherwise
134 // this is a NOP. Must be a power of 2 and a max of 8192.
135 db_.set_page_size(4096);
136
137 // The default cache size is 2000 which give >8MB of data. Since we will often
138 // have 2-3 of these objects, each with their own 8MB, this adds up very fast.
139 // We therefore reduce the size so when there are multiple objects, we're not
140 // too big.
141 db_.set_cache_size(512);
142
143 // Run the database in exclusive mode. Nobody else should be accessing the
144 // database while we're running, and this will give somewhat improved perf.
145 db_.set_exclusive_locking();
146
147 // Attach the database to our index file.
148 if (!db_.Open(file_name_))
149 return false;
150
151 // Meta table tracking version information.
152 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber))
153 return false;
154 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) {
155 // This version is too new. We don't bother notifying the user on this
156 // error, and just fail to use the file. Normally if they have version skew,
157 // they will get it for the main history file and it won't be necessary
158 // here. If that's not the case, since this is only indexed data, it's
159 // probably better to just not give FTS results than strange errors when
160 // everything else is working OK.
161 LOG(WARNING) << "Text database is too new.";
162 return false;
163 }
164
165 return CreateTables();
166 }
167
BeginTransaction()168 void TextDatabase::BeginTransaction() {
169 db_.BeginTransaction();
170 }
171
CommitTransaction()172 void TextDatabase::CommitTransaction() {
173 db_.CommitTransaction();
174 }
175
CreateTables()176 bool TextDatabase::CreateTables() {
177 // FTS table of page contents.
178 if (!db_.DoesTableExist("pages")) {
179 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3("
180 "TOKENIZE icu,"
181 "url LONGVARCHAR,"
182 "title LONGVARCHAR,"
183 "body LONGVARCHAR)"))
184 return false;
185 }
186
187 // Non-FTS table containing URLs and times so we can efficiently find them
188 // using a regular index (all FTS columns are special and are treated as
189 // full-text-search, which is not what we want when retrieving this data).
190 if (!db_.DoesTableExist("info")) {
191 // Note that there is no point in creating an index over time. Since
192 // we must always query the entire FTS table (it can not efficiently do
193 // subsets), we will always end up doing that first, and joining the info
194 // table off of that.
195 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)"))
196 return false;
197 }
198
199 // Create the index. This will fail when the index already exists, so we just
200 // ignore the error.
201 db_.Execute("CREATE INDEX info_time ON info(time)");
202 return true;
203 }
204
AddPageData(base::Time time,const std::string & url,const std::string & title,const std::string & contents)205 bool TextDatabase::AddPageData(base::Time time,
206 const std::string& url,
207 const std::string& title,
208 const std::string& contents) {
209 sql::Transaction committer(&db_);
210 if (!committer.Begin())
211 return false;
212
213 // Add to the pages table.
214 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE,
215 "INSERT INTO pages (url, title, body) VALUES (?,?,?)"));
216 if (!add_to_pages) {
217 NOTREACHED() << db_.GetErrorMessage();
218 return false;
219 }
220 add_to_pages.BindString(0, url);
221 add_to_pages.BindString(1, title);
222 add_to_pages.BindString(2, contents);
223 if (!add_to_pages.Run()) {
224 NOTREACHED() << db_.GetErrorMessage();
225 return false;
226 }
227
228 int64 rowid = db_.GetLastInsertRowId();
229
230 // Add to the info table with the same rowid.
231 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE,
232 "INSERT INTO info (rowid, time) VALUES (?,?)"));
233 if (!add_to_info) {
234 NOTREACHED() << db_.GetErrorMessage();
235 return false;
236 }
237 add_to_info.BindInt64(0, rowid);
238 add_to_info.BindInt64(1, time.ToInternalValue());
239 if (!add_to_info.Run()) {
240 NOTREACHED() << db_.GetErrorMessage();
241 return false;
242 }
243
244 return committer.Commit();
245 }
246
DeletePageData(base::Time time,const std::string & url)247 void TextDatabase::DeletePageData(base::Time time, const std::string& url) {
248 // First get all rows that match. Selecing on time (which has an index) allows
249 // us to avoid brute-force searches on the full-text-index table (there will
250 // generally be only one match per time).
251 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE,
252 "SELECT info.rowid "
253 "FROM info JOIN pages ON info.rowid = pages.rowid "
254 "WHERE info.time=? AND pages.url=?"));
255 if (!select_ids)
256 return;
257 select_ids.BindInt64(0, time.ToInternalValue());
258 select_ids.BindString(1, url);
259 std::set<int64> rows_to_delete;
260 while (select_ids.Step())
261 rows_to_delete.insert(select_ids.ColumnInt64(0));
262
263 // Delete from the pages table.
264 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE,
265 "DELETE FROM pages WHERE rowid=?"));
266 if (!delete_page)
267 return;
268 for (std::set<int64>::const_iterator i = rows_to_delete.begin();
269 i != rows_to_delete.end(); ++i) {
270 delete_page.BindInt64(0, *i);
271 if (!delete_page.Run()) {
272 NOTREACHED();
273 return;
274 }
275 delete_page.Reset();
276 }
277
278 // Delete from the info table.
279 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE,
280 "DELETE FROM info WHERE rowid=?"));
281 if (!delete_info)
282 return;
283 for (std::set<int64>::const_iterator i = rows_to_delete.begin();
284 i != rows_to_delete.end(); ++i) {
285 delete_info.BindInt64(0, *i);
286 if (!delete_info.Run()) {
287 NOTREACHED();
288 return;
289 }
290 delete_info.Reset();
291 }
292 }
293
Optimize()294 void TextDatabase::Optimize() {
295 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
296 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1"));
297 if (!statement)
298 return;
299 statement.Run();
300 }
301
GetTextMatches(const std::string & query,const QueryOptions & options,std::vector<Match> * results,URLSet * found_urls,base::Time * first_time_searched)302 void TextDatabase::GetTextMatches(const std::string& query,
303 const QueryOptions& options,
304 std::vector<Match>* results,
305 URLSet* found_urls,
306 base::Time* first_time_searched) {
307 *first_time_searched = options.begin_time;
308
309 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
310 "SELECT url, title, time, offsets(pages), body "
311 "FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid "
312 "WHERE pages MATCH ? AND time >= ? AND time < ? "
313 "ORDER BY time DESC "
314 "LIMIT ?"));
315 if (!statement)
316 return;
317
318 // When their values indicate "unspecified", saturate the numbers to the max
319 // or min to get the correct result.
320 int64 effective_begin_time = options.begin_time.is_null() ?
321 0 : options.begin_time.ToInternalValue();
322 int64 effective_end_time = options.end_time.is_null() ?
323 std::numeric_limits<int64>::max() : options.end_time.ToInternalValue();
324 int effective_max_count = options.max_count ?
325 options.max_count : std::numeric_limits<int>::max();
326
327 statement.BindString(0, query);
328 statement.BindInt64(1, effective_begin_time);
329 statement.BindInt64(2, effective_end_time);
330 statement.BindInt(3, effective_max_count);
331
332 while (statement.Step()) {
333 // TODO(brettw) allow canceling the query in the middle.
334 // if (canceled_or_something)
335 // break;
336
337 GURL url(statement.ColumnString(0));
338 URLSet::const_iterator found_url = found_urls->find(url);
339 if (found_url != found_urls->end())
340 continue; // Don't add this duplicate.
341
342 // Fill the results into the vector (avoid copying the URL with Swap()).
343 results->resize(results->size() + 1);
344 Match& match = results->at(results->size() - 1);
345 match.url.Swap(&url);
346
347 match.title = statement.ColumnString16(1);
348 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2));
349
350 // Extract any matches in the title.
351 std::string offsets_str = statement.ColumnString(3);
352 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,
353 &match.title_match_positions);
354 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1),
355 &match.title_match_positions);
356
357 // Extract the matches in the body.
358 Snippet::MatchPositions match_positions;
359 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,
360 &match_positions);
361
362 // Compute the snippet based on those matches.
363 std::string body = statement.ColumnString(4);
364 match.snippet.ComputeSnippet(match_positions, body);
365 }
366
367 // When we have returned all the results possible (or determined that there
368 // are none), then we have searched all the time requested, so we can
369 // set the first_time_searched to that value.
370 if (results->empty() ||
371 options.max_count == 0 || // Special case for wanting all the results.
372 static_cast<int>(results->size()) < options.max_count) {
373 *first_time_searched = options.begin_time;
374 } else {
375 // Since we got the results in order, we know the last item is the last
376 // time we considered.
377 *first_time_searched = results->back().time;
378 }
379
380 statement.Reset();
381 }
382
383 } // namespace history
384