1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_ 16 #define ICING_INDEX_MAIN_MAIN_INDEX_H_ 17 18 #include <memory> 19 20 #include "icing/text_classifier/lib3/utils/base/status.h" 21 #include "icing/text_classifier/lib3/utils/base/statusor.h" 22 #include "icing/file/filesystem.h" 23 #include "icing/file/posting_list/flash-index-storage.h" 24 #include "icing/index/lite/term-id-hit-pair.h" 25 #include "icing/index/main/posting-list-hit-accessor.h" 26 #include "icing/index/main/posting-list-hit-serializer.h" 27 #include "icing/index/term-id-codec.h" 28 #include "icing/index/term-metadata.h" 29 #include "icing/legacy/index/icing-dynamic-trie.h" 30 #include "icing/legacy/index/icing-filesystem.h" 31 #include "icing/proto/debug.pb.h" 32 #include "icing/proto/scoring.pb.h" 33 #include "icing/proto/storage.pb.h" 34 #include "icing/proto/term.pb.h" 35 #include "icing/store/namespace-id.h" 36 #include "icing/store/suggestion-result-checker.h" 37 #include "icing/util/status-macros.h" 38 39 namespace icing { 40 namespace lib { 41 42 class MainIndex { 43 public: 44 // RETURNS: 45 // - valid instance of MainIndex, on success. 46 // - INTERNAL error if unable to create the lexicon or flash storage. 47 static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create( 48 const std::string& index_directory, const Filesystem* filesystem, 49 const IcingFilesystem* icing_filesystem); 50 51 // Reads magic from existing flash index storage file header. We need this 52 // during Icing initialization phase to determine the version. 53 // 54 // RETURNS: 55 // - On success, a valid magic. 56 // - NOT_FOUND if the flash index doesn't exist. 57 // - INTERNAL on I/O error. 58 static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic( 59 const Filesystem* filesystem, const std::string& index_directory); 60 61 // Get a PostingListHitAccessor that holds the posting list chain for 'term'. 62 // 63 // RETURNS: 64 // - On success, a valid PostingListHitAccessor 65 // - NOT_FOUND if term is not present in the main index. 66 libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>> 67 GetAccessorForExactTerm(const std::string& term); 68 69 // Get a PostingListHitAccessor for 'prefix'. 70 // 71 // RETURNS: 72 // - On success, a result containing a valid PostingListHitAccessor. 73 // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a 74 // prefix are present in the main index. 75 struct GetPrefixAccessorResult { 76 // A PostingListHitAccessor that holds the posting list chain for the term 77 // that best represents 'prefix' in the main index. 78 std::unique_ptr<PostingListHitAccessor> accessor; 79 // True if the returned posting list chain is for 'prefix' or false if the 80 // returned posting list chain is for a term for which 'prefix' is a prefix. 81 bool exact; 82 GetPrefixAccessorResultGetPrefixAccessorResult83 explicit GetPrefixAccessorResult( 84 std::unique_ptr<PostingListHitAccessor> accessor_in, bool exact_in) 85 : accessor(std::move(accessor_in)), exact(exact_in) {} 86 }; 87 libtextclassifier3::StatusOr<GetPrefixAccessorResult> 88 GetAccessorForPrefixTerm(const std::string& prefix); 89 90 // Finds terms with the given prefix in the given result checker. The 91 // input prefix must be normalized, otherwise inaccurate results may be 92 // returned. If scoring_match_type is EXACT, only exact hit will be counted 93 // and it is PREFIX, both prefix and exact hits will be counted. Results are 94 // not sorted specifically and are in lexigraphical order. Number of results 95 // are no more than 'num_to_return'. 96 // 97 // Returns: 98 // A list of TermMetadata on success 99 // INTERNAL_ERROR if failed to access term data. 100 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( 101 const std::string& prefix, TermMatchType::Code scoring_match_type, 102 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 103 const SuggestionResultChecker* suggestion_result_checker); 104 105 struct LexiconMergeOutputs { 106 // Maps from main_lexicon tvi for new branching point to the main_lexicon 107 // tvi for posting list whose hits must be backfilled. 108 std::unordered_map<uint32_t, uint32_t> backfill_map; 109 110 // Maps from lexicon tvis to main_lexicon tvis. 111 std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi; 112 113 // Maps from main lexicon tvi to the block index. Tvis with no entry do not 114 // have an allocated posting list. 115 std::unordered_map<uint32_t, int> main_tvi_to_block_index; 116 117 // Maps from the lexicon tvi to the beginning position in 118 // prefix_tvis_buf and the length. 119 std::unordered_map<uint32_t, std::pair<int, int>> 120 other_tvi_to_prefix_main_tvis; 121 122 // Stores tvis that are mapped to by other_tvi_to_prefix_tvis. 123 std::vector<uint32_t> prefix_tvis_buf; 124 }; 125 126 // Merge the lexicon into the main lexicon and populate the data 127 // structures necessary to translate lite tvis to main tvis, track backfilling 128 // and expanding lite terms to prefix terms. 129 // 130 // RETURNS: 131 // - OK on success 132 // - INTERNAL on IO error while writing to the main lexicon. MergeLexicon(const IcingDynamicTrie & other_lexicon)133 libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon( 134 const IcingDynamicTrie& other_lexicon) { 135 // Backfill branch points need to be added first so that the backfill_map 136 // can be correctly populated. 137 ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs, 138 AddBackfillBranchPoints(other_lexicon)); 139 ICING_ASSIGN_OR_RETURN(outputs, 140 AddTerms(other_lexicon, std::move(outputs))); 141 // Non-backfill branch points need to be added last so that the mapping of 142 // newly added terms to prefix terms can be correctly populated (prefix 143 // terms might be branch points between two new terms or between a 144 // pre-existing term and a new term). 145 ICING_ASSIGN_OR_RETURN(outputs, 146 AddBranchPoints(other_lexicon, std::move(outputs))); 147 return outputs; 148 } 149 150 // Add hits to the main index and backfill from existing posting lists to new 151 // backfill branch points. 152 // 153 // The backfill_map maps from main_lexicon tvi for a newly added branching 154 // point to the main_lexicon tvi for the posting list whose hits must be 155 // backfilled. backfill_map should be populated as part of LexiconMergeOutputs 156 // in MergeLexicon and be blindly passed to this function. 157 // 158 // RETURNS: 159 // - OK on success 160 // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id 161 // exceeds the max TermId, is not valid or is not less than pre-existing hits 162 // in the main index. 163 // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks 164 // - RESOURCE_EXHAUSTED error if unable to grow the index 165 libtextclassifier3::Status AddHits( 166 const TermIdCodec& term_id_codec, 167 std::unordered_map<uint32_t, uint32_t>&& backfill_map, 168 std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id); 169 PersistToDisk()170 libtextclassifier3::Status PersistToDisk() { 171 if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) { 172 return libtextclassifier3::Status::OK; 173 } 174 return absl_ports::InternalError("Unable to sync main index components."); 175 } 176 last_added_document_id()177 DocumentId last_added_document_id() const { 178 return flash_index_storage_->get_last_indexed_docid(); 179 } 180 Reset()181 libtextclassifier3::Status Reset() { 182 ICING_RETURN_IF_ERROR(flash_index_storage_->Reset()); 183 main_lexicon_->Clear(); 184 return libtextclassifier3::Status::OK; 185 } 186 Warm()187 void Warm() { main_lexicon_->Warm(); } 188 189 // Returns: 190 // - elements size of lexicon and index, on success 191 // - INTERNAL on IO error 192 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; 193 194 // Takes the provided storage_info, populates the fields related to the main 195 // index and returns that storage_info. 196 // 197 // If an IO error occurs while trying to calculate the value for a field, then 198 // that field will be set to -1. 199 IndexStorageInfoProto GetStorageInfo( 200 IndexStorageInfoProto storage_info) const; 201 202 // Returns debug information for the main index in out. 203 // verbosity = BASIC, simplest debug information - just the lexicon 204 // verbosity = DETAILED, more detailed debug information including raw 205 // postings lists. 206 std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const; 207 208 // Reduces internal file sizes by reclaiming space of deleted documents. 209 // 210 // This method will update the last_added_docid of the index to the largest 211 // document id that still appears in the index after compaction. 212 // 213 // Returns: 214 // OK on success 215 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 216 // invalid state and should be cleared. 217 libtextclassifier3::Status Optimize( 218 const std::vector<DocumentId>& document_id_old_to_new); 219 220 private: 221 explicit MainIndex(const std::string& index_directory, 222 const Filesystem* filesystem, 223 const IcingFilesystem* icing_filesystem); 224 225 libtextclassifier3::Status Init(); 226 227 // Helpers for merging the lexicon 228 // Add all 'backfill' branch points. Backfill branch points are prefix 229 // branch points that are a prefix of terms that existed in the lexicon 230 // to the merge. 231 // 232 // For example, if the main lexicon only contains "foot" and is then merged 233 // with a lite lexicon containing only "fool", then a backfill branch point 234 // for "foo" will be added to contain prefix hits from both the pre-existing 235 // posting list for "foot" and the new posting list for "fool". 236 // 237 // Populates LexiconMergeOutputs.backfill_map 238 // 239 // RETURNS: 240 // - OK on success 241 // - INTERNAL on IO error while writing to the main lexicon. 242 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints( 243 const IcingDynamicTrie& other_lexicon); 244 245 // Add all terms from the lexicon. 246 // 247 // Populates LexiconMergeOutputs.other_tvi_to_main_tvi 248 // 249 // RETURNS: 250 // - OK on success 251 // - INTERNAL on IO error while writing to the main lexicon. 252 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms( 253 const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); 254 255 // Add all branch points for terms added from the lexicon. 256 // For example, if the main lexicon is empty and is then merged with a 257 // lexicon containing only "foot" and "fool", then a branch point for "foo" 258 // will be added to contain prefix hits from both "foot" and "fool". 259 // 260 // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and 261 // LexiconMergeOutputs.prefix_tvis_buf; 262 // 263 // RETURNS: 264 // - OK on success 265 // - INTERNAL on IO error while writing to the main lexicon. 266 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints( 267 const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); 268 269 // Copies all properties from old_tvi in the other lexicon to the new_tvi in 270 // the main lexicon. 271 // Returns true on success, false if an IO error is encountered. 272 bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader, 273 const IcingDynamicTrie& other_lexicon, uint32_t other_tvi, 274 uint32_t new_main_tvi); 275 276 // Add all hits between [hit_elements, hit_elements + len) to main_index, 277 // updating the entry in the main lexicon at trie_value_index to point to the 278 // resulting posting list. Hits are sorted in descending document id order, so 279 // they should be to posting lists in reverse (starting at hit_elements 280 // + len - 1) and working backwards. Therefore, hit_elements must be in sorted 281 // order. 282 // 283 // trie_value_index may point to a valid posting list id if there is a 284 // pre-existing posting list to append to. 285 // 286 // If backfill_posting_list_id is valid, then the hits from the posting list 287 // identified by backfill_posting_list_id should be added to the new posting 288 // list before the hits in hit_elements. 289 // 290 // RETURNS: 291 // - OK on success 292 // - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid 293 // but points out of bounds in the IndexBlock referred to by 294 // id.block_index(), if one of the hits from [hit_elements,hit_elements+len) 295 // is not valid, or if one of the hits from [hit_elements,hit_elements+len) 296 // is not less than the previously added hits. 297 // - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid 298 // but points to an invalid block index or if unable to mmap the IndexBlock. 299 // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new 300 // posting list. 301 libtextclassifier3::Status AddHitsForTerm( 302 uint32_t tvi, PostingListIdentifier backfill_posting_list_id, 303 const TermIdHitPair* hit_elements, size_t len); 304 305 // Adds all prefix hits or hits from prefix sections present on the posting 306 // list identified by backfill_posting_list_id to hit_accum. 307 // 308 // RETURNS: 309 // - OK, on success 310 // - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the 311 // IndexBlock referred to by id.block_index() 312 // - INTERNAL_ERROR if unable to mmap the block identified by 313 // backfill_posting_list_id or if the posting list identified by 314 // backfill_posting_list_id has been corrupted. 315 // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new 316 // posting list. 317 libtextclassifier3::Status AddPrefixBackfillHits( 318 PostingListIdentifier backfill_posting_list_id, 319 PostingListHitAccessor* hit_accum); 320 321 // Transfer hits from old_pl_accessor to new_index for term. 322 // 323 // Returns: 324 // largest document id added to the translated posting list, on success 325 // INTERNAL_ERROR on IO error 326 static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits( 327 const std::vector<DocumentId>& document_id_old_to_new, const char* term, 328 PostingListHitAccessor& old_pl_accessor, MainIndex* new_index); 329 330 // Transfer hits from the current main index to new_index. 331 // 332 // Returns: 333 // OK on success 334 // INTERNAL_ERROR on IO error 335 libtextclassifier3::Status TransferIndex( 336 const std::vector<DocumentId>& document_id_old_to_new, 337 MainIndex* new_index); 338 339 std::string base_dir_; 340 const Filesystem* filesystem_; 341 const IcingFilesystem* icing_filesystem_; 342 std::unique_ptr<PostingListHitSerializer> posting_list_hit_serializer_; 343 std::unique_ptr<FlashIndexStorage> flash_index_storage_; 344 std::unique_ptr<IcingDynamicTrie> main_lexicon_; 345 }; 346 347 } // namespace lib 348 } // namespace icing 349 350 #endif // ICING_INDEX_MAIN_MAIN_INDEX_H_ 351