1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/text_database_manager.h"
6
7 #include "base/compiler_specific.h"
8 #include "base/file_util.h"
9 #include "base/metrics/histogram.h"
10 #include "base/logging.h"
11 #include "base/message_loop.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "chrome/browser/history/history_publisher.h"
15 #include "chrome/browser/history/visit_database.h"
16 #include "content/common/mru_cache.h"
17
18 using base::Time;
19 using base::TimeDelta;
20 using base::TimeTicks;
21
22 namespace history {
23
24 namespace {
25
26 // The number of database files we will be attached to at once.
27 const int kCacheDBSize = 5;
28
ConvertStringForIndexer(const string16 & input)29 std::string ConvertStringForIndexer(const string16& input) {
30 // TODO(evanm): other transformations here?
31 return UTF16ToUTF8(CollapseWhitespace(input, false));
32 }
33
34 // Data older than this will be committed to the full text index even if we
35 // haven't gotten a title and/or body.
36 const int kExpirationSec = 20;
37
38 } // namespace
39
40 // TextDatabaseManager::ChangeSet ----------------------------------------------
41
ChangeSet()42 TextDatabaseManager::ChangeSet::ChangeSet() {}
43
~ChangeSet()44 TextDatabaseManager::ChangeSet::~ChangeSet() {}
45
46 // TextDatabaseManager::PageInfo -----------------------------------------------
47
PageInfo(URLID url_id,VisitID visit_id,Time visit_time)48 TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
49 VisitID visit_id,
50 Time visit_time)
51 : url_id_(url_id),
52 visit_id_(visit_id),
53 visit_time_(visit_time) {
54 added_time_ = TimeTicks::Now();
55 }
56
~PageInfo()57 TextDatabaseManager::PageInfo::~PageInfo() {}
58
set_title(const string16 & ttl)59 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
60 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet.
61 title_ = ASCIIToUTF16(" ");
62 else
63 title_ = ttl;
64 }
65
set_body(const string16 & bdy)66 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
67 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet.
68 body_ = ASCIIToUTF16(" ");
69 else
70 body_ = bdy;
71 }
72
Expired(TimeTicks now) const73 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
74 return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
75 }
76
77 // TextDatabaseManager ---------------------------------------------------------
78
TextDatabaseManager(const FilePath & dir,URLDatabase * url_database,VisitDatabase * visit_database)79 TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
80 URLDatabase* url_database,
81 VisitDatabase* visit_database)
82 : dir_(dir),
83 url_database_(url_database),
84 visit_database_(visit_database),
85 recent_changes_(RecentChangeList::NO_AUTO_EVICT),
86 transaction_nesting_(0),
87 db_cache_(DBCache::NO_AUTO_EVICT),
88 present_databases_loaded_(false),
89 ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
90 history_publisher_(NULL) {
91 }
92
~TextDatabaseManager()93 TextDatabaseManager::~TextDatabaseManager() {
94 if (transaction_nesting_)
95 CommitTransaction();
96 }
97
98 // static
TimeToID(Time time)99 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
100 Time::Exploded exploded;
101 time.UTCExplode(&exploded);
102
103 // We combine the month and year into a 6-digit number (200801 for
104 // January, 2008). The month is 1-based.
105 return exploded.year * 100 + exploded.month;
106 }
107
108 // static
IDToTime(TextDatabase::DBIdent id)109 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
110 Time::Exploded exploded;
111 memset(&exploded, 0, sizeof(Time::Exploded));
112 exploded.year = id / 100;
113 exploded.month = id % 100;
114 return Time::FromUTCExploded(exploded);
115 }
116
Init(const HistoryPublisher * history_publisher)117 bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
118 history_publisher_ = history_publisher;
119
120 // Start checking recent changes and committing them.
121 ScheduleFlushOldChanges();
122 return true;
123 }
124
BeginTransaction()125 void TextDatabaseManager::BeginTransaction() {
126 transaction_nesting_++;
127 }
128
CommitTransaction()129 void TextDatabaseManager::CommitTransaction() {
130 DCHECK(transaction_nesting_);
131 transaction_nesting_--;
132 if (transaction_nesting_)
133 return; // Still more nesting of transactions before committing.
134
135 // Commit all databases with open transactions on them.
136 for (DBIdentSet::const_iterator i = open_transactions_.begin();
137 i != open_transactions_.end(); ++i) {
138 DBCache::iterator iter = db_cache_.Get(*i);
139 if (iter == db_cache_.end()) {
140 NOTREACHED() << "All open transactions should be cached.";
141 continue;
142 }
143 iter->second->CommitTransaction();
144 }
145 open_transactions_.clear();
146
147 // Now that the transaction is over, we can expire old connections.
148 db_cache_.ShrinkToSize(kCacheDBSize);
149 }
150
InitDBList()151 void TextDatabaseManager::InitDBList() {
152 if (present_databases_loaded_)
153 return;
154
155 present_databases_loaded_ = true;
156
157 // Find files on disk matching our pattern so we can quickly test for them.
158 FilePath::StringType filepattern(TextDatabase::file_base());
159 filepattern.append(FILE_PATH_LITERAL("*"));
160 file_util::FileEnumerator enumerator(
161 dir_, false, file_util::FileEnumerator::FILES, filepattern);
162 FilePath cur_file;
163 while (!(cur_file = enumerator.Next()).empty()) {
164 // Convert to the number representing this file.
165 TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
166 if (id) // Will be 0 on error.
167 present_databases_.insert(id);
168 }
169 }
170
AddPageURL(const GURL & url,URLID url_id,VisitID visit_id,Time time)171 void TextDatabaseManager::AddPageURL(const GURL& url,
172 URLID url_id,
173 VisitID visit_id,
174 Time time) {
175 // Delete any existing page info.
176 RecentChangeList::iterator found = recent_changes_.Peek(url);
177 if (found != recent_changes_.end())
178 recent_changes_.Erase(found);
179
180 // Just save this info for later. We will save it when it expires or when all
181 // the data is complete.
182 recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
183 }
184
AddPageTitle(const GURL & url,const string16 & title)185 void TextDatabaseManager::AddPageTitle(const GURL& url,
186 const string16& title) {
187 RecentChangeList::iterator found = recent_changes_.Peek(url);
188 if (found == recent_changes_.end()) {
189 // This page is not in our cache of recent pages. This is very much an edge
190 // case as normally a title will come in <20 seconds after the page commits,
191 // and TabContents will avoid spamming us with >1 title per page. However,
192 // it could come up if your connection is unhappy, and we don't want to
193 // miss anything.
194 //
195 // To solve this problem, we'll just associate the most recent visit with
196 // the new title and index that using the regular code path.
197 URLRow url_row;
198 if (!url_database_->GetRowForURL(url, &url_row))
199 return; // URL is unknown, give up.
200 VisitRow visit;
201 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
202 return; // No recent visit, give up.
203
204 if (visit.is_indexed) {
205 // If this page was already indexed, we could have a body that came in
206 // first and we don't want to overwrite it. We could go query for the
207 // current body, or have a special setter for only the title, but this is
208 // not worth it for this edge case.
209 //
210 // It will be almost impossible for the title to take longer than
211 // kExpirationSec yet we got a body in less than that time, since the
212 // title should always come in first.
213 return;
214 }
215
216 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
217 title, string16());
218 return; // We don't know about this page, give up.
219 }
220
221 PageInfo& info = found->second;
222 if (info.has_body()) {
223 // This info is complete, write to the database.
224 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
225 title, info.body());
226 recent_changes_.Erase(found);
227 return;
228 }
229
230 info.set_title(title);
231 }
232
AddPageContents(const GURL & url,const string16 & body)233 void TextDatabaseManager::AddPageContents(const GURL& url,
234 const string16& body) {
235 RecentChangeList::iterator found = recent_changes_.Peek(url);
236 if (found == recent_changes_.end()) {
237 // This page is not in our cache of recent pages. This means that the page
238 // took more than kExpirationSec to load. Often, this will be the result of
239 // a very slow iframe or other resource on the page that makes us think its
240 // still loading.
241 //
242 // As a fallback, set the most recent visit's contents using the input, and
243 // use the last set title in the URL table as the title to index.
244 URLRow url_row;
245 if (!url_database_->GetRowForURL(url, &url_row))
246 return; // URL is unknown, give up.
247 VisitRow visit;
248 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
249 return; // No recent visit, give up.
250
251 // Use the title from the URL row as the title for the indexing.
252 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
253 url_row.title(), body);
254 return;
255 }
256
257 PageInfo& info = found->second;
258 if (info.has_title()) {
259 // This info is complete, write to the database.
260 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
261 info.title(), body);
262 recent_changes_.Erase(found);
263 return;
264 }
265
266 info.set_body(body);
267 }
268
AddPageData(const GURL & url,URLID url_id,VisitID visit_id,Time visit_time,const string16 & title,const string16 & body)269 bool TextDatabaseManager::AddPageData(const GURL& url,
270 URLID url_id,
271 VisitID visit_id,
272 Time visit_time,
273 const string16& title,
274 const string16& body) {
275 TextDatabase* db = GetDBForTime(visit_time, true);
276 if (!db)
277 return false;
278
279 TimeTicks beginning_time = TimeTicks::Now();
280
281 // First delete any recently-indexed data for this page. This will delete
282 // anything in the main database, but we don't bother looking through the
283 // archived database.
284 VisitVector visits;
285 visit_database_->GetVisitsForURL(url_id, &visits);
286 size_t our_visit_row_index = visits.size();
287 for (size_t i = 0; i < visits.size(); i++) {
288 // While we're going trough all the visits, also find our row so we can
289 // avoid another DB query.
290 if (visits[i].visit_id == visit_id) {
291 our_visit_row_index = i;
292 } else if (visits[i].is_indexed) {
293 visits[i].is_indexed = false;
294 visit_database_->UpdateVisitRow(visits[i]);
295 DeletePageData(visits[i].visit_time, url, NULL);
296 }
297 }
298
299 if (visit_id) {
300 // We're supposed to update the visit database.
301 if (our_visit_row_index >= visits.size()) {
302 NOTREACHED() << "We should always have found a visit when given an ID.";
303 return false;
304 }
305
306 DCHECK(visit_time == visits[our_visit_row_index].visit_time);
307
308 // Update the visit database to reference our addition.
309 visits[our_visit_row_index].is_indexed = true;
310 if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
311 return false;
312 }
313
314 // Now index the data.
315 std::string url_str = URLDatabase::GURLToDatabaseURL(url);
316 bool success = db->AddPageData(visit_time, url_str,
317 ConvertStringForIndexer(title),
318 ConvertStringForIndexer(body));
319
320 UMA_HISTOGRAM_TIMES("History.AddFTSData",
321 TimeTicks::Now() - beginning_time);
322
323 if (history_publisher_)
324 history_publisher_->PublishPageContent(visit_time, url, title, body);
325
326 return success;
327 }
328
DeletePageData(Time time,const GURL & url,ChangeSet * change_set)329 void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
330 ChangeSet* change_set) {
331 TextDatabase::DBIdent db_ident = TimeToID(time);
332
333 // We want to open the database for writing, but only if it exists. To
334 // achieve this, we check whether it exists by saying we're not going to
335 // write to it (avoiding the autocreation code normally called when writing)
336 // and then access it for writing only if it succeeds.
337 TextDatabase* db = GetDB(db_ident, false);
338 if (!db)
339 return;
340 db = GetDB(db_ident, true);
341
342 if (change_set)
343 change_set->Add(db_ident);
344
345 db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
346 }
347
DeleteFromUncommitted(const std::set<GURL> & restrict_urls,Time begin,Time end)348 void TextDatabaseManager::DeleteFromUncommitted(
349 const std::set<GURL>& restrict_urls, Time begin, Time end) {
350 // First find the beginning of the range to delete. Recall that the list
351 // has the most recent item at the beginning. There won't normally be very
352 // many items, so a brute-force search is fine.
353 RecentChangeList::iterator cur = recent_changes_.begin();
354 if (!end.is_null()) {
355 // Walk from the beginning of the list backwards in time to find the newest
356 // entry that should be deleted.
357 while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
358 ++cur;
359 }
360
361 // Now delete all visits up to the oldest one we were supposed to delete.
362 // Note that if begin is_null, it will be less than or equal to any other
363 // time.
364 if (restrict_urls.empty()) {
365 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
366 cur = recent_changes_.Erase(cur);
367 } else {
368 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
369 if (restrict_urls.find(cur->first) != restrict_urls.end())
370 cur = recent_changes_.Erase(cur);
371 else
372 ++cur;
373 }
374 }
375 }
376
DeleteAll()377 void TextDatabaseManager::DeleteAll() {
378 DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
379
380 InitDBList();
381
382 // Close all open databases.
383 db_cache_.Clear();
384
385 // Now go through and delete all the files.
386 for (DBIdentSet::iterator i = present_databases_.begin();
387 i != present_databases_.end(); ++i) {
388 FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
389 file_util::Delete(file_name, false);
390 }
391 }
392
OptimizeChangedDatabases(const ChangeSet & change_set)393 void TextDatabaseManager::OptimizeChangedDatabases(
394 const ChangeSet& change_set) {
395 for (ChangeSet::DBSet::const_iterator i =
396 change_set.changed_databases_.begin();
397 i != change_set.changed_databases_.end(); ++i) {
398 // We want to open the database for writing, but only if it exists. To
399 // achieve this, we check whether it exists by saying we're not going to
400 // write to it (avoiding the autocreation code normally called when writing)
401 // and then access it for writing only if it succeeds.
402 TextDatabase* db = GetDB(*i, false);
403 if (!db)
404 continue;
405 db = GetDB(*i, true);
406 if (!db)
407 continue; // The file may have changed or something.
408 db->Optimize();
409 }
410 }
411
GetTextMatches(const string16 & query,const QueryOptions & options,std::vector<TextDatabase::Match> * results,Time * first_time_searched)412 void TextDatabaseManager::GetTextMatches(
413 const string16& query,
414 const QueryOptions& options,
415 std::vector<TextDatabase::Match>* results,
416 Time* first_time_searched) {
417 results->clear();
418
419 InitDBList();
420 if (present_databases_.empty()) {
421 // Nothing to search.
422 *first_time_searched = options.begin_time;
423 return;
424 }
425
426 // Get the query into the proper format for the individual DBs.
427 string16 fts_query16;
428 query_parser_.ParseQuery(query, &fts_query16);
429 std::string fts_query = UTF16ToUTF8(fts_query16);
430
431 // Need a copy of the options so we can modify the max count for each call
432 // to the individual databases.
433 QueryOptions cur_options(options);
434
435 // Compute the minimum and maximum values for the identifiers that could
436 // encompass the input time range.
437 TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
438 *present_databases_.begin() :
439 TimeToID(options.begin_time);
440 TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
441 *present_databases_.rbegin() :
442 TimeToID(options.end_time);
443
444 // Iterate over the databases from the most recent backwards.
445 bool checked_one = false;
446 TextDatabase::URLSet found_urls;
447 for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
448 i != present_databases_.rend();
449 ++i) {
450 // TODO(brettw) allow canceling the query in the middle.
451 // if (canceled_or_something)
452 // break;
453
454 // This code is stupid, we just loop until we find the correct starting
455 // time range rather than search in an intelligent way. Users will have a
456 // few dozen files at most, so this should not be an issue.
457 if (*i > max_ident)
458 continue; // Haven't gotten to the time range yet.
459 if (*i < min_ident)
460 break; // Covered all the time range.
461
462 TextDatabase* cur_db = GetDB(*i, false);
463 if (!cur_db)
464 continue;
465
466 // Adjust the max count according to how many results we've already got.
467 if (options.max_count) {
468 cur_options.max_count = options.max_count -
469 static_cast<int>(results->size());
470 }
471
472 // Since we are going backwards in time, it is always OK to pass the
473 // current first_time_searched, since it will always be smaller than
474 // any previous set.
475 cur_db->GetTextMatches(fts_query, cur_options,
476 results, &found_urls, first_time_searched);
477 checked_one = true;
478
479 DCHECK(options.max_count == 0 ||
480 static_cast<int>(results->size()) <= options.max_count);
481 if (options.max_count &&
482 static_cast<int>(results->size()) >= options.max_count)
483 break; // Got the max number of results.
484 }
485
486 // When there were no databases in the range, we need to fix up the min time.
487 if (!checked_one)
488 *first_time_searched = options.begin_time;
489 }
490
GetDB(TextDatabase::DBIdent id,bool for_writing)491 TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
492 bool for_writing) {
493 DBCache::iterator found_db = db_cache_.Get(id);
494 if (found_db != db_cache_.end()) {
495 if (transaction_nesting_ && for_writing &&
496 open_transactions_.find(id) == open_transactions_.end()) {
497 // If we currently have an open transaction, that database is not yet
498 // part of the transaction, and the database will be written to, it needs
499 // to be part of our transaction.
500 found_db->second->BeginTransaction();
501 open_transactions_.insert(id);
502 }
503 return found_db->second;
504 }
505
506 // Need to make the database.
507 TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
508 if (!new_db->Init()) {
509 delete new_db;
510 return NULL;
511 }
512 db_cache_.Put(id, new_db);
513 present_databases_.insert(id);
514
515 if (transaction_nesting_ && for_writing) {
516 // If we currently have an open transaction and the new database will be
517 // written to, it needs to be part of our transaction.
518 new_db->BeginTransaction();
519 open_transactions_.insert(id);
520 }
521
522 // When no transaction is open, allow this new one to kick out an old one.
523 if (!transaction_nesting_)
524 db_cache_.ShrinkToSize(kCacheDBSize);
525
526 return new_db;
527 }
528
GetDBForTime(Time time,bool create_if_necessary)529 TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
530 bool create_if_necessary) {
531 return GetDB(TimeToID(time), create_if_necessary);
532 }
533
ScheduleFlushOldChanges()534 void TextDatabaseManager::ScheduleFlushOldChanges() {
535 factory_.RevokeAll();
536 MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
537 &TextDatabaseManager::FlushOldChanges),
538 kExpirationSec * Time::kMillisecondsPerSecond);
539 }
540
FlushOldChanges()541 void TextDatabaseManager::FlushOldChanges() {
542 FlushOldChangesForTime(TimeTicks::Now());
543 }
544
FlushOldChangesForTime(TimeTicks now)545 void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
546 // The end of the list is the oldest, so we just start from there committing
547 // things until we get something too new.
548 RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
549 while (i != recent_changes_.rend() && i->second.Expired(now)) {
550 AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
551 i->second.visit_time(), i->second.title(), i->second.body());
552 i = recent_changes_.Erase(i);
553 }
554
555 ScheduleFlushOldChanges();
556 }
557
558 } // namespace history
559