1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 7 #pragma once 8 9 #include <functional> 10 #include <map> 11 #include <set> 12 #include <string> 13 #include <vector> 14 15 #include "app/sql/connection.h" 16 #include "base/basictypes.h" 17 #include "base/file_path.h" 18 #include "base/gtest_prod_util.h" 19 #include "base/memory/linked_ptr.h" 20 #include "base/memory/scoped_ptr.h" 21 #include "base/string16.h" 22 #include "chrome/browser/autocomplete/autocomplete_match.h" 23 #include "chrome/browser/autocomplete/history_provider_util.h" 24 #include "chrome/browser/history/history_types.h" 25 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" 26 #include "testing/gtest/include/gtest/gtest_prod.h" 27 28 class Profile; 29 30 namespace base { 31 class Time; 32 } 33 34 namespace in_memory_url_index { 35 class InMemoryURLIndexCacheItem; 36 } 37 38 namespace history { 39 40 namespace imui = in_memory_url_index; 41 42 class URLDatabase; 43 44 // Specifies where an omnibox term occurs within a string. Used for specifying 45 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in 46 // scoring a result. 47 struct TermMatch { TermMatchTermMatch48 TermMatch(int term_num, size_t offset, size_t length) 49 : term_num(term_num), 50 offset(offset), 51 length(length) {} 52 53 int term_num; // The index of the term in the original search string. 54 size_t offset; // The starting offset of the substring match. 55 size_t length; // The length of the substring match. 56 }; 57 typedef std::vector<TermMatch> TermMatches; 58 59 // Used for intermediate history result operations. 60 struct ScoredHistoryMatch : public HistoryMatch { 61 ScoredHistoryMatch(); // Required by STL. 62 explicit ScoredHistoryMatch(const URLRow& url_info); 63 ~ScoredHistoryMatch(); 64 65 static bool MatchScoreGreater(const ScoredHistoryMatch& m1, 66 const ScoredHistoryMatch& m2); 67 68 // An interim score taking into consideration location and completeness 69 // of the match. 70 int raw_score; 71 TermMatches url_matches; // Term matches within the URL. 72 TermMatches title_matches; // Term matches within the page title. 73 size_t prefix_adjust; // The length of a prefix which should be ignored. 74 }; 75 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; 76 77 // The URL history source. 78 // Holds portions of the URL database in memory in an indexed form. Used to 79 // quickly look up matching URLs for a given query string. Used by 80 // the HistoryURLProvider for inline autocomplete and to provide URL 81 // matches to the omnibox. 82 // 83 // Note about multi-byte codepoints and the data structures in the 84 // InMemoryURLIndex class: One will quickly notice that no effort is made to 85 // insure that multi-byte character boundaries are detected when indexing the 86 // words and characters in the URL history database except when converting 87 // URL strings to lowercase. Multi-byte-edness makes no difference when 88 // indexing or when searching the index as the final filtering of results 89 // is dependent on the comparison of a string of bytes, not individual 90 // characters. While the lookup of those bytes during a search in the 91 // |char_word_map_| could serve up words in which the individual char16 92 // occurs as a portion of a composite character the next filtering step 93 // will eliminate such words except in the case where a single character 94 // is being searched on and which character occurs as the second char16 of a 95 // multi-char16 instance. 96 class InMemoryURLIndex { 97 public: 98 // |history_dir| is a path to the directory containing the history database 99 // within the profile wherein the cache and transaction journals will be 100 // stored. 101 explicit InMemoryURLIndex(const FilePath& history_dir); 102 ~InMemoryURLIndex(); 103 104 // Convenience types 105 typedef std::vector<string16> String16Vector; 106 107 // Opens and indexes the URL history database. 108 // |languages| gives a list of language encodings with which the history 109 // URLs and omnibox searches are interpreted, i.e. when each is broken 110 // down into words and each word is broken down into characters. 111 bool Init(URLDatabase* history_db, const std::string& languages); 112 113 // Reloads the history index. Attempts to reload from the cache unless 114 // |clear_cache| is true. If the cache is unavailable then reload the 115 // index from |history_db|. 116 bool ReloadFromHistory(URLDatabase* history_db, bool clear_cache); 117 118 // Signals that any outstanding initialization should be canceled and 119 // flushes the cache to disk. 120 void ShutDown(); 121 122 // Restores the index's private data from the cache file stored in the 123 // profile directory and returns true if successful. 124 bool RestoreFromCacheFile(); 125 126 // Caches the index private data and writes the cache file to the profile 127 // directory. 128 bool SaveToCacheFile(); 129 130 // Given a vector containing one or more words as string16s, scans the 131 // history index and return a vector with all scored, matching history items. 132 // Each term must occur somewhere in the history item's URL or page title for 133 // the item to qualify; however, the terms do not necessarily have to be 134 // adjacent. Results are sorted with higher scoring items first. Each term 135 // from |terms| may contain punctuation but should not contain spaces. 136 // A search request which results in more than |kItemsToScoreLimit| total 137 // candidate items returns no matches (though the results set will be 138 // retained and used for subsequent calls to this function) as the scoring 139 // of such a large number of candidates may cause perceptible typing response 140 // delays in the omnibox. This is likely to occur for short omnibox terms 141 // such as 'h' and 'w' which will be found in nearly all history candidates. 142 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); 143 144 // Updates or adds an history item to the index if it meets the minimum 145 // 'quick' criteria. 146 void UpdateURL(URLID row_id, const URLRow& row); 147 148 // Deletes indexing data for an history item. The item may not have actually 149 // been indexed (which is the case if it did not previously meet minimum 150 // 'quick' criteria). 151 void DeleteURL(URLID row_id); 152 153 // Breaks the |uni_string| string down into individual words and return 154 // a vector with the individual words in their original order. If 155 // |break_on_space| is false then the resulting list will contain only words 156 // containing alpha-numeric characters. If |break_on_space| is true then the 157 // resulting list will contain strings broken at whitespace. 158 // 159 // Example: 160 // Given: |uni_string|: "http://www.google.com/ harry the rabbit." 161 // With |break_on_space| false the returned list will contain: 162 // "http", "www", "google", "com", "harry", "the", "rabbit" 163 // With |break_on_space| true the returned list will contain: 164 // "http://", "www.google.com/", "harry", "the", "rabbit." 165 static String16Vector WordVectorFromString16(const string16& uni_string, 166 bool break_on_space); 167 168 // Extract and return the offsets from |matches|. 169 static std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches); 170 171 // Replace the offsets in |matches| with those given in |offsets|, deleting 172 // any which are npos, and return the updated list of matches. 173 static TermMatches ReplaceOffsetsInTermMatches( 174 const TermMatches& matches, 175 const std::vector<size_t>& offsets); 176 177 private: 178 friend class AddHistoryMatch; 179 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); 180 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); 181 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); 182 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities); 183 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); 184 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions); 185 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); 186 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); 187 188 // Signals that there are no previously cached results for the typed term. 189 static const size_t kNoCachedResultForTerm; 190 191 // Creating one of me without a history path is not allowed (tests excepted). 192 InMemoryURLIndex(); 193 194 // Convenience types. 195 typedef std::set<string16> String16Set; 196 typedef std::set<char16> Char16Set; 197 typedef std::vector<char16> Char16Vector; 198 199 // An index into list of all of the words we have indexed. 200 typedef int WordID; 201 202 // A map allowing a WordID to be determined given a word. 203 typedef std::map<string16, WordID> WordMap; 204 205 // A map from character to word_ids. 206 typedef std::set<WordID> WordIDSet; // An index into the WordList. 207 typedef std::map<char16, WordIDSet> CharWordIDMap; 208 209 // A map from word_id to history item. 210 // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. 211 // Consider using a smaller type. 212 typedef URLID HistoryID; 213 typedef std::set<HistoryID> HistoryIDSet; 214 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; 215 216 // Support caching of term character results so that we can optimize 217 // searches which build upon a previous search. Each entry in this vector 218 // represents a progressive reduction of the result set for each unique 219 // character found in the search term, with each character being taken as 220 // initially encountered. For example, once the results for the search 221 // term of "mextexarkana" have been fully determined, this vector will 222 // contain one entry for the characters: m, e, x, t, a, r, k, & n, in 223 // that order. The result sets will naturally shrink in size for each 224 // subsequent character as the sets intersections are taken in an 225 // incremental manner. 226 struct TermCharWordSet; 227 typedef std::vector<TermCharWordSet> TermCharWordSetVector; 228 229 // TODO(rohitrao): Probably replace this with QueryResults. 230 typedef std::vector<URLRow> URLRowVector; 231 232 // A map from history_id to the history's URL and title. 233 typedef std::map<HistoryID, URLRow> HistoryInfoMap; 234 235 // A helper class which performs the final filter on each candidate 236 // history URL match, inserting accepted matches into |scored_matches_| 237 // and trimming the maximum number of matches to 10. 238 class AddHistoryMatch : public std::unary_function<HistoryID, void> { 239 public: 240 AddHistoryMatch(const InMemoryURLIndex& index, 241 const String16Vector& lower_terms); 242 ~AddHistoryMatch(); 243 244 void operator()(const HistoryID history_id); 245 ScoredMatches()246 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } 247 248 private: 249 const InMemoryURLIndex& index_; 250 ScoredHistoryMatches scored_matches_; 251 const String16Vector& lower_terms_; 252 }; 253 254 // Initializes all index data members in preparation for restoring the index 255 // from the cache or a complete rebuild from the history database. 256 void ClearPrivateData(); 257 258 // Breaks a string down into individual words. 259 static String16Set WordSetFromString16(const string16& uni_string); 260 261 // Given a vector of Char16s, representing the characters the user has typed 262 // into the omnibox, finds words containing those characters. If any 263 // existing, cached set is a proper subset then starts with that cached 264 // set. Updates the previously-typed-character cache. 265 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); 266 267 // Given a vector of Char16s in |uni_chars|, compare those characters, in 268 // order, with the previously searched term, returning the index of the 269 // cached results in the term_char_word_set_cache_ of the set with best 270 // matching number of characters. Returns kNoCachedResultForTerm if there 271 // was no match at all (i.e. the first character of |uni-chars| is not the 272 // same as the character of the first entry in term_char_word_set_cache_). 273 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); 274 275 // Creates a TermMatches which has an entry for each occurrence of the string 276 // |term| found in the string |string|. Mark each match with |term_num| so 277 // that the resulting TermMatches can be merged with other TermMatches for 278 // other terms. 279 static TermMatches MatchTermInString(const string16& term, 280 const string16& string, 281 int term_num); 282 283 // URL History indexing support functions. 284 285 // Indexes one URL history item. 286 bool IndexRow(const URLRow& row); 287 288 // Breaks a string down into unique, individual characters in the order 289 // in which the characters are first encountered in the |uni_word| string. 290 static Char16Vector Char16VectorFromString16(const string16& uni_word); 291 292 // Breaks the |uni_word| string down into its individual characters. 293 // Note that this is temporarily intended to work on a single word, but 294 // _will_ work on a string of words, perhaps with unexpected results. 295 // TODO(mrossetti): Lots of optimizations possible here for not restarting 296 // a search if the user is just typing along. Also, change this to uniString 297 // and properly handle substring matches, scoring and sorting the results 298 // by score. Also, provide the metrics for where the matches occur so that 299 // the UI can highlight the matched sections. 300 static Char16Set Char16SetFromString16(const string16& uni_word); 301 302 // Given a single word in |uni_word|, adds a reference for the containing 303 // history item identified by |history_id| to the index. 304 void AddWordToIndex(const string16& uni_word, HistoryID history_id); 305 306 // Updates an existing entry in the word/history index by adding the 307 // |history_id| to set for |word_id| in the word_id_history_map_. 308 void UpdateWordHistory(WordID word_id, HistoryID history_id); 309 310 // Creates a new entry in the word/history map for |word_id| and add 311 // |history_id| as the initial element of the word's set. 312 void AddWordHistory(const string16& uni_word, HistoryID history_id); 313 314 // Clears the search term cache. This cache holds on to the intermediate 315 // word results for each previously typed character to eliminate the need 316 // to re-perform set operations for previously typed characters. 317 void ResetTermCharWordSetCache(); 318 319 // Composes a set of history item IDs by intersecting the set for each word 320 // in |uni_string|. 321 HistoryIDSet HistoryIDSetFromWords(const string16& uni_string); 322 323 // Helper function to HistoryIDSetFromWords which composes a set of history 324 // ids for the given term given in |uni_word|. 325 HistoryIDSet HistoryIDsForTerm(const string16& uni_word); 326 327 // Calculates a raw score for this history item by first determining 328 // if all of the terms in |terms_vector| occur in |row| and, if so, 329 // calculating a raw score based on 1) starting position of each term 330 // in the user input, 2) completeness of each term's match, 3) ordering 331 // of the occurrence of each term (i.e. they appear in order), 4) last 332 // visit time, and 5) number of visits. 333 // This raw score allows the results to be ordered and can be used 334 // to influence the final score calculated by the client of this 335 // index. Returns a ScoredHistoryMatch structure with the raw score and 336 // substring matching metrics. 337 static ScoredHistoryMatch ScoredMatchForURL( 338 const URLRow& row, 339 const String16Vector& terms_vector); 340 341 // Calculates a component score based on position, ordering and total 342 // substring match size using metrics recorded in |matches|. |max_length| 343 // is the length of the string against which the terms are being searched. 344 static int ScoreComponentForMatches(const TermMatches& matches, 345 size_t max_length); 346 347 // Sorts and removes overlapping substring matches from |matches| and 348 // returns the cleaned up matches. 349 static TermMatches SortAndDeoverlap(const TermMatches& matches); 350 351 // Utility functions supporting RestoreFromCache and SaveToCache. 352 353 // Construct a file path for the cache file within the same directory where 354 // the history database is kept and saves that path to |file_path|. Returns 355 // true if |file_path| can be successfully constructed. (This function 356 // provided as a hook for unit testing.) 357 bool GetCacheFilePath(FilePath* file_path); 358 359 // Encode a data structure into the protobuf |cache|. 360 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const; 361 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const; 362 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 363 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 364 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const; 365 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const; 366 367 // Decode a data structure from the protobuf |cache|. Return false if there 368 // is any kind of failure. 369 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache); 370 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache); 371 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache); 372 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache); 373 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache); 374 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache); 375 376 // Directory where cache file resides. This is, except when unit testing, 377 // the same directory in which the profile's history database is found. It 378 // should never be empty. 379 FilePath history_dir_; 380 381 // The timestamp of when the cache was last saved. This is used to validate 382 // the transaction journal's applicability to the cache. The timestamp is 383 // initialized to the NULL time, indicating that the cache was not used with 384 // the InMemoryURLIndex was last populated. 385 base::Time last_saved_; 386 387 // A list of all of indexed words. The index of a word in this list is the 388 // ID of the word in the word_map_. It reduces the memory overhead by 389 // replacing a potentially long and repeated string with a simple index. 390 // NOTE: A word will _never_ be removed from this vector thus insuring 391 // the immutability of the word_id throughout the session, reducing 392 // maintenance complexity. 393 // TODO(mrossetti): Profile the vector allocation and determine if judicious 394 // 'reserve' calls are called for. 395 String16Vector word_list_; 396 397 int history_item_count_; 398 WordMap word_map_; 399 CharWordIDMap char_word_map_; 400 WordIDHistoryMap word_id_history_map_; 401 TermCharWordSetVector term_char_word_set_cache_; 402 HistoryInfoMap history_info_map_; 403 std::string languages_; 404 405 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); 406 }; 407 408 } // namespace history 409 410 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 411