• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/history/text_database_manager.h"
6 
7 #include "base/compiler_specific.h"
8 #include "base/file_util.h"
9 #include "base/metrics/histogram.h"
10 #include "base/logging.h"
11 #include "base/message_loop.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "chrome/browser/history/history_publisher.h"
15 #include "chrome/browser/history/visit_database.h"
16 #include "content/common/mru_cache.h"
17 
18 using base::Time;
19 using base::TimeDelta;
20 using base::TimeTicks;
21 
22 namespace history {
23 
24 namespace {
25 
26 // The number of database files we will be attached to at once.
27 const int kCacheDBSize = 5;
28 
ConvertStringForIndexer(const string16 & input)29 std::string ConvertStringForIndexer(const string16& input) {
30   // TODO(evanm): other transformations here?
31   return UTF16ToUTF8(CollapseWhitespace(input, false));
32 }
33 
34 // Data older than this will be committed to the full text index even if we
35 // haven't gotten a title and/or body.
36 const int kExpirationSec = 20;
37 
38 }  // namespace
39 
40 // TextDatabaseManager::ChangeSet ----------------------------------------------
41 
ChangeSet()42 TextDatabaseManager::ChangeSet::ChangeSet() {}
43 
~ChangeSet()44 TextDatabaseManager::ChangeSet::~ChangeSet() {}
45 
46 // TextDatabaseManager::PageInfo -----------------------------------------------
47 
PageInfo(URLID url_id,VisitID visit_id,Time visit_time)48 TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
49                                         VisitID visit_id,
50                                         Time visit_time)
51     : url_id_(url_id),
52       visit_id_(visit_id),
53       visit_time_(visit_time) {
54   added_time_ = TimeTicks::Now();
55 }
56 
~PageInfo()57 TextDatabaseManager::PageInfo::~PageInfo() {}
58 
set_title(const string16 & ttl)59 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
60   if (ttl.empty())  // Make the title nonempty when we set it for EverybodySet.
61     title_ = ASCIIToUTF16(" ");
62   else
63     title_ = ttl;
64 }
65 
set_body(const string16 & bdy)66 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
67   if (bdy.empty())  // Make the body nonempty when we set it for EverybodySet.
68     body_ = ASCIIToUTF16(" ");
69   else
70     body_ = bdy;
71 }
72 
Expired(TimeTicks now) const73 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
74   return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
75 }
76 
77 // TextDatabaseManager ---------------------------------------------------------
78 
TextDatabaseManager(const FilePath & dir,URLDatabase * url_database,VisitDatabase * visit_database)79 TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
80                                          URLDatabase* url_database,
81                                          VisitDatabase* visit_database)
82     : dir_(dir),
83       url_database_(url_database),
84       visit_database_(visit_database),
85       recent_changes_(RecentChangeList::NO_AUTO_EVICT),
86       transaction_nesting_(0),
87       db_cache_(DBCache::NO_AUTO_EVICT),
88       present_databases_loaded_(false),
89       ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
90       history_publisher_(NULL) {
91 }
92 
~TextDatabaseManager()93 TextDatabaseManager::~TextDatabaseManager() {
94   if (transaction_nesting_)
95     CommitTransaction();
96 }
97 
98 // static
TimeToID(Time time)99 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
100   Time::Exploded exploded;
101   time.UTCExplode(&exploded);
102 
103   // We combine the month and year into a 6-digit number (200801 for
104   // January, 2008). The month is 1-based.
105   return exploded.year * 100 + exploded.month;
106 }
107 
108 // static
IDToTime(TextDatabase::DBIdent id)109 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
110   Time::Exploded exploded;
111   memset(&exploded, 0, sizeof(Time::Exploded));
112   exploded.year = id / 100;
113   exploded.month = id % 100;
114   return Time::FromUTCExploded(exploded);
115 }
116 
Init(const HistoryPublisher * history_publisher)117 bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
118   history_publisher_ = history_publisher;
119 
120   // Start checking recent changes and committing them.
121   ScheduleFlushOldChanges();
122   return true;
123 }
124 
BeginTransaction()125 void TextDatabaseManager::BeginTransaction() {
126   transaction_nesting_++;
127 }
128 
CommitTransaction()129 void TextDatabaseManager::CommitTransaction() {
130   DCHECK(transaction_nesting_);
131   transaction_nesting_--;
132   if (transaction_nesting_)
133     return;  // Still more nesting of transactions before committing.
134 
135   // Commit all databases with open transactions on them.
136   for (DBIdentSet::const_iterator i = open_transactions_.begin();
137        i != open_transactions_.end(); ++i) {
138     DBCache::iterator iter = db_cache_.Get(*i);
139     if (iter == db_cache_.end()) {
140       NOTREACHED() << "All open transactions should be cached.";
141       continue;
142     }
143     iter->second->CommitTransaction();
144   }
145   open_transactions_.clear();
146 
147   // Now that the transaction is over, we can expire old connections.
148   db_cache_.ShrinkToSize(kCacheDBSize);
149 }
150 
InitDBList()151 void TextDatabaseManager::InitDBList() {
152   if (present_databases_loaded_)
153     return;
154 
155   present_databases_loaded_ = true;
156 
157   // Find files on disk matching our pattern so we can quickly test for them.
158   FilePath::StringType filepattern(TextDatabase::file_base());
159   filepattern.append(FILE_PATH_LITERAL("*"));
160   file_util::FileEnumerator enumerator(
161       dir_, false, file_util::FileEnumerator::FILES, filepattern);
162   FilePath cur_file;
163   while (!(cur_file = enumerator.Next()).empty()) {
164     // Convert to the number representing this file.
165     TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
166     if (id)  // Will be 0 on error.
167       present_databases_.insert(id);
168   }
169 }
170 
AddPageURL(const GURL & url,URLID url_id,VisitID visit_id,Time time)171 void TextDatabaseManager::AddPageURL(const GURL& url,
172                                      URLID url_id,
173                                      VisitID visit_id,
174                                      Time time) {
175   // Delete any existing page info.
176   RecentChangeList::iterator found = recent_changes_.Peek(url);
177   if (found != recent_changes_.end())
178     recent_changes_.Erase(found);
179 
180   // Just save this info for later. We will save it when it expires or when all
181   // the data is complete.
182   recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
183 }
184 
AddPageTitle(const GURL & url,const string16 & title)185 void TextDatabaseManager::AddPageTitle(const GURL& url,
186                                        const string16& title) {
187   RecentChangeList::iterator found = recent_changes_.Peek(url);
188   if (found == recent_changes_.end()) {
189     // This page is not in our cache of recent pages. This is very much an edge
190     // case as normally a title will come in <20 seconds after the page commits,
191     // and TabContents will avoid spamming us with >1 title per page. However,
192     // it could come up if your connection is unhappy, and we don't want to
193     // miss anything.
194     //
195     // To solve this problem, we'll just associate the most recent visit with
196     // the new title and index that using the regular code path.
197     URLRow url_row;
198     if (!url_database_->GetRowForURL(url, &url_row))
199       return;  // URL is unknown, give up.
200     VisitRow visit;
201     if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
202       return;  // No recent visit, give up.
203 
204     if (visit.is_indexed) {
205       // If this page was already indexed, we could have a body that came in
206       // first and we don't want to overwrite it. We could go query for the
207       // current body, or have a special setter for only the title, but this is
208       // not worth it for this edge case.
209       //
210       // It will be almost impossible for the title to take longer than
211       // kExpirationSec yet we got a body in less than that time, since the
212       // title should always come in first.
213       return;
214     }
215 
216     AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
217                 title, string16());
218     return;  // We don't know about this page, give up.
219   }
220 
221   PageInfo& info = found->second;
222   if (info.has_body()) {
223     // This info is complete, write to the database.
224     AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
225                 title, info.body());
226     recent_changes_.Erase(found);
227     return;
228   }
229 
230   info.set_title(title);
231 }
232 
AddPageContents(const GURL & url,const string16 & body)233 void TextDatabaseManager::AddPageContents(const GURL& url,
234                                           const string16& body) {
235   RecentChangeList::iterator found = recent_changes_.Peek(url);
236   if (found == recent_changes_.end()) {
237     // This page is not in our cache of recent pages. This means that the page
238     // took more than kExpirationSec to load. Often, this will be the result of
239     // a very slow iframe or other resource on the page that makes us think its
240     // still loading.
241     //
242     // As a fallback, set the most recent visit's contents using the input, and
243     // use the last set title in the URL table as the title to index.
244     URLRow url_row;
245     if (!url_database_->GetRowForURL(url, &url_row))
246       return;  // URL is unknown, give up.
247     VisitRow visit;
248     if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
249       return;  // No recent visit, give up.
250 
251     // Use the title from the URL row as the title for the indexing.
252     AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
253                 url_row.title(), body);
254     return;
255   }
256 
257   PageInfo& info = found->second;
258   if (info.has_title()) {
259     // This info is complete, write to the database.
260     AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
261                 info.title(), body);
262     recent_changes_.Erase(found);
263     return;
264   }
265 
266   info.set_body(body);
267 }
268 
AddPageData(const GURL & url,URLID url_id,VisitID visit_id,Time visit_time,const string16 & title,const string16 & body)269 bool TextDatabaseManager::AddPageData(const GURL& url,
270                                       URLID url_id,
271                                       VisitID visit_id,
272                                       Time visit_time,
273                                       const string16& title,
274                                       const string16& body) {
275   TextDatabase* db = GetDBForTime(visit_time, true);
276   if (!db)
277     return false;
278 
279   TimeTicks beginning_time = TimeTicks::Now();
280 
281   // First delete any recently-indexed data for this page. This will delete
282   // anything in the main database, but we don't bother looking through the
283   // archived database.
284   VisitVector visits;
285   visit_database_->GetVisitsForURL(url_id, &visits);
286   size_t our_visit_row_index = visits.size();
287   for (size_t i = 0; i < visits.size(); i++) {
288     // While we're going trough all the visits, also find our row so we can
289     // avoid another DB query.
290     if (visits[i].visit_id == visit_id) {
291       our_visit_row_index = i;
292     } else if (visits[i].is_indexed) {
293       visits[i].is_indexed = false;
294       visit_database_->UpdateVisitRow(visits[i]);
295       DeletePageData(visits[i].visit_time, url, NULL);
296     }
297   }
298 
299   if (visit_id) {
300     // We're supposed to update the visit database.
301     if (our_visit_row_index >= visits.size()) {
302       NOTREACHED() << "We should always have found a visit when given an ID.";
303       return false;
304     }
305 
306     DCHECK(visit_time == visits[our_visit_row_index].visit_time);
307 
308     // Update the visit database to reference our addition.
309     visits[our_visit_row_index].is_indexed = true;
310     if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
311       return false;
312   }
313 
314   // Now index the data.
315   std::string url_str = URLDatabase::GURLToDatabaseURL(url);
316   bool success = db->AddPageData(visit_time, url_str,
317                                  ConvertStringForIndexer(title),
318                                  ConvertStringForIndexer(body));
319 
320   UMA_HISTOGRAM_TIMES("History.AddFTSData",
321                       TimeTicks::Now() - beginning_time);
322 
323   if (history_publisher_)
324     history_publisher_->PublishPageContent(visit_time, url, title, body);
325 
326   return success;
327 }
328 
DeletePageData(Time time,const GURL & url,ChangeSet * change_set)329 void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
330                                          ChangeSet* change_set) {
331   TextDatabase::DBIdent db_ident = TimeToID(time);
332 
333   // We want to open the database for writing, but only if it exists. To
334   // achieve this, we check whether it exists by saying we're not going to
335   // write to it (avoiding the autocreation code normally called when writing)
336   // and then access it for writing only if it succeeds.
337   TextDatabase* db = GetDB(db_ident, false);
338   if (!db)
339     return;
340   db = GetDB(db_ident, true);
341 
342   if (change_set)
343     change_set->Add(db_ident);
344 
345   db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
346 }
347 
DeleteFromUncommitted(const std::set<GURL> & restrict_urls,Time begin,Time end)348 void TextDatabaseManager::DeleteFromUncommitted(
349     const std::set<GURL>& restrict_urls, Time begin, Time end) {
350   // First find the beginning of the range to delete. Recall that the list
351   // has the most recent item at the beginning. There won't normally be very
352   // many items, so a brute-force search is fine.
353   RecentChangeList::iterator cur = recent_changes_.begin();
354   if (!end.is_null()) {
355     // Walk from the beginning of the list backwards in time to find the newest
356     // entry that should be deleted.
357     while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
358       ++cur;
359   }
360 
361   // Now delete all visits up to the oldest one we were supposed to delete.
362   // Note that if begin is_null, it will be less than or equal to any other
363   // time.
364   if (restrict_urls.empty()) {
365     while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
366       cur = recent_changes_.Erase(cur);
367   } else {
368     while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
369       if (restrict_urls.find(cur->first) != restrict_urls.end())
370         cur = recent_changes_.Erase(cur);
371       else
372         ++cur;
373     }
374   }
375 }
376 
DeleteAll()377 void TextDatabaseManager::DeleteAll() {
378   DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
379 
380   InitDBList();
381 
382   // Close all open databases.
383   db_cache_.Clear();
384 
385   // Now go through and delete all the files.
386   for (DBIdentSet::iterator i = present_databases_.begin();
387        i != present_databases_.end(); ++i) {
388     FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
389     file_util::Delete(file_name, false);
390   }
391 }
392 
OptimizeChangedDatabases(const ChangeSet & change_set)393 void TextDatabaseManager::OptimizeChangedDatabases(
394     const ChangeSet& change_set) {
395   for (ChangeSet::DBSet::const_iterator i =
396            change_set.changed_databases_.begin();
397        i != change_set.changed_databases_.end(); ++i) {
398     // We want to open the database for writing, but only if it exists. To
399     // achieve this, we check whether it exists by saying we're not going to
400     // write to it (avoiding the autocreation code normally called when writing)
401     // and then access it for writing only if it succeeds.
402     TextDatabase* db = GetDB(*i, false);
403     if (!db)
404       continue;
405     db = GetDB(*i, true);
406     if (!db)
407       continue;  // The file may have changed or something.
408     db->Optimize();
409   }
410 }
411 
GetTextMatches(const string16 & query,const QueryOptions & options,std::vector<TextDatabase::Match> * results,Time * first_time_searched)412 void TextDatabaseManager::GetTextMatches(
413     const string16& query,
414     const QueryOptions& options,
415     std::vector<TextDatabase::Match>* results,
416     Time* first_time_searched) {
417   results->clear();
418 
419   InitDBList();
420   if (present_databases_.empty()) {
421     // Nothing to search.
422     *first_time_searched = options.begin_time;
423     return;
424   }
425 
426   // Get the query into the proper format for the individual DBs.
427   string16 fts_query16;
428   query_parser_.ParseQuery(query, &fts_query16);
429   std::string fts_query = UTF16ToUTF8(fts_query16);
430 
431   // Need a copy of the options so we can modify the max count for each call
432   // to the individual databases.
433   QueryOptions cur_options(options);
434 
435   // Compute the minimum and maximum values for the identifiers that could
436   // encompass the input time range.
437   TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
438       *present_databases_.begin() :
439       TimeToID(options.begin_time);
440   TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
441       *present_databases_.rbegin() :
442       TimeToID(options.end_time);
443 
444   // Iterate over the databases from the most recent backwards.
445   bool checked_one = false;
446   TextDatabase::URLSet found_urls;
447   for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
448        i != present_databases_.rend();
449        ++i) {
450     // TODO(brettw) allow canceling the query in the middle.
451     // if (canceled_or_something)
452     //   break;
453 
454     // This code is stupid, we just loop until we find the correct starting
455     // time range rather than search in an intelligent way. Users will have a
456     // few dozen files at most, so this should not be an issue.
457     if (*i > max_ident)
458       continue;  // Haven't gotten to the time range yet.
459     if (*i < min_ident)
460       break;  // Covered all the time range.
461 
462     TextDatabase* cur_db = GetDB(*i, false);
463     if (!cur_db)
464       continue;
465 
466     // Adjust the max count according to how many results we've already got.
467     if (options.max_count) {
468       cur_options.max_count = options.max_count -
469           static_cast<int>(results->size());
470     }
471 
472     // Since we are going backwards in time, it is always OK to pass the
473     // current first_time_searched, since it will always be smaller than
474     // any previous set.
475     cur_db->GetTextMatches(fts_query, cur_options,
476                            results, &found_urls, first_time_searched);
477     checked_one = true;
478 
479     DCHECK(options.max_count == 0 ||
480            static_cast<int>(results->size()) <= options.max_count);
481     if (options.max_count &&
482         static_cast<int>(results->size()) >= options.max_count)
483       break;  // Got the max number of results.
484   }
485 
486   // When there were no databases in the range, we need to fix up the min time.
487   if (!checked_one)
488     *first_time_searched = options.begin_time;
489 }
490 
GetDB(TextDatabase::DBIdent id,bool for_writing)491 TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
492                                          bool for_writing) {
493   DBCache::iterator found_db = db_cache_.Get(id);
494   if (found_db != db_cache_.end()) {
495     if (transaction_nesting_ && for_writing &&
496         open_transactions_.find(id) == open_transactions_.end()) {
497       // If we currently have an open transaction, that database is not yet
498       // part of the transaction, and the database will be written to, it needs
499       // to be part of our transaction.
500       found_db->second->BeginTransaction();
501       open_transactions_.insert(id);
502     }
503     return found_db->second;
504   }
505 
506   // Need to make the database.
507   TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
508   if (!new_db->Init()) {
509     delete new_db;
510     return NULL;
511   }
512   db_cache_.Put(id, new_db);
513   present_databases_.insert(id);
514 
515   if (transaction_nesting_ && for_writing) {
516     // If we currently have an open transaction and the new database will be
517     // written to, it needs to be part of our transaction.
518     new_db->BeginTransaction();
519     open_transactions_.insert(id);
520   }
521 
522   // When no transaction is open, allow this new one to kick out an old one.
523   if (!transaction_nesting_)
524     db_cache_.ShrinkToSize(kCacheDBSize);
525 
526   return new_db;
527 }
528 
GetDBForTime(Time time,bool create_if_necessary)529 TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
530                                                 bool create_if_necessary) {
531   return GetDB(TimeToID(time), create_if_necessary);
532 }
533 
ScheduleFlushOldChanges()534 void TextDatabaseManager::ScheduleFlushOldChanges() {
535   factory_.RevokeAll();
536   MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
537           &TextDatabaseManager::FlushOldChanges),
538       kExpirationSec * Time::kMillisecondsPerSecond);
539 }
540 
FlushOldChanges()541 void TextDatabaseManager::FlushOldChanges() {
542   FlushOldChangesForTime(TimeTicks::Now());
543 }
544 
FlushOldChangesForTime(TimeTicks now)545 void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
546   // The end of the list is the oldest, so we just start from there committing
547   // things until we get something too new.
548   RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
549   while (i != recent_changes_.rend() && i->second.Expired(now)) {
550     AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
551                 i->second.visit_time(), i->second.title(), i->second.body());
552     i = recent_changes_.Erase(i);
553   }
554 
555   ScheduleFlushOldChanges();
556 }
557 
558 }  // namespace history
559