1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_INDEX_H_ 16 #define ICING_INDEX_INDEX_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <unordered_map> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/file/filesystem.h" 28 #include "icing/index/hit/hit.h" 29 #include "icing/index/iterator/doc-hit-info-iterator.h" 30 #include "icing/index/lite/lite-index.h" 31 #include "icing/index/lite/term-id-hit-pair.h" 32 #include "icing/index/main/main-index-merger.h" 33 #include "icing/index/main/main-index.h" 34 #include "icing/index/term-id-codec.h" 35 #include "icing/index/term-metadata.h" 36 #include "icing/legacy/index/icing-filesystem.h" 37 #include "icing/proto/debug.pb.h" 38 #include "icing/proto/logging.pb.h" 39 #include "icing/proto/scoring.pb.h" 40 #include "icing/proto/storage.pb.h" 41 #include "icing/proto/term.pb.h" 42 #include "icing/schema/section.h" 43 #include "icing/store/document-id.h" 44 #include "icing/store/namespace-id.h" 45 #include "icing/store/suggestion-result-checker.h" 46 #include "icing/util/status-macros.h" 47 48 namespace icing { 49 namespace lib { 50 51 // The class representing the Icing search index. This index maps terms to hits 52 // (document_ids, section_ids). 53 // Content is added to the index through the Editor class - which also dedupes 54 // hits (calling Editor::AddHit with the same arguments will only result in the 55 // creation of a single hit). 56 // Ex. 57 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index, 58 // . Index::Create(MakeIndexOptions())); 59 // Index::Editor editor = index->Edit(document_id, section_id, 60 // TermMatchType::EXACT_ONLY); ICING_RETURN_IF_ERROR(editor.AddHit("foo")); 61 // ICING_RETURN_IF_ERROR(editor.AddHit("baz")); 62 // 63 // Content is retrieved from the index through the Iterator class. 64 // Ex. 65 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index, 66 // . Index::Create(MakeIndexOptions())); 67 // ICING_ASSIGN_OR_RETURN(Index::Iterator iterator = 68 // index->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); 69 // while(iterator->Advance().ok()) 70 // ProcessResult(iterator->value()); 71 class Index { 72 public: 73 struct Options { OptionsOptions74 explicit Options(const std::string& base_dir, uint32_t index_merge_size, 75 bool lite_index_sort_at_indexing, 76 uint32_t lite_index_sort_size) 77 : base_dir(base_dir), 78 index_merge_size(index_merge_size), 79 lite_index_sort_at_indexing(lite_index_sort_at_indexing), 80 lite_index_sort_size(lite_index_sort_size) {} 81 82 std::string base_dir; 83 int32_t index_merge_size; 84 bool lite_index_sort_at_indexing; 85 int32_t lite_index_sort_size; 86 }; 87 88 // Creates an instance of Index in the directory pointed by file_dir. 89 // 90 // Returns: 91 // Valid Index on success 92 // DATA_LOSS if the index was corrupt and had to be cleared 93 // INVALID_ARGUMENT if options have invalid values 94 // INTERNAL on I/O error 95 static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create( 96 const Options& options, const Filesystem* filesystem, 97 const IcingFilesystem* icing_filesystem); 98 99 // Reads magic from existing flash (main) index file header. We need this 100 // during Icing initialization phase to determine the version. 101 // 102 // Returns 103 // Valid magic on success 104 // NOT_FOUND if the lite index doesn't exist 105 // INTERNAL on I/O error 106 static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic( 107 const Filesystem* filesystem, const std::string& base_dir); 108 109 // Clears all files created by the index. Returns OK if all files were 110 // cleared. Reset()111 libtextclassifier3::Status Reset() { 112 ICING_RETURN_IF_ERROR(lite_index_->Reset()); 113 return main_index_->Reset(); 114 } 115 116 // Brings components of the index into memory in anticipation of a query in 117 // order to reduce latency. Warm()118 void Warm() { 119 lite_index_->Warm(); 120 main_index_->Warm(); 121 } 122 123 // Syncs all the data and metadata changes to disk. 124 // 125 // Returns: 126 // OK on success 127 // INTERNAL on I/O errors PersistToDisk()128 libtextclassifier3::Status PersistToDisk() { 129 ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk()); 130 return main_index_->PersistToDisk(); 131 } 132 133 // Discard parts of the index if they contain data for document ids greater 134 // than document_id. 135 // 136 // NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect. 137 // 138 // Returns: 139 // OK on success 140 // INTERNAL on I/O errors 141 libtextclassifier3::Status TruncateTo(DocumentId document_id); 142 143 // DocumentIds are always inserted in increasing order. Returns the largest 144 // document_id added to the index. last_added_document_id()145 DocumentId last_added_document_id() const { 146 DocumentId lite_document_id = lite_index_->last_added_document_id(); 147 if (lite_document_id != kInvalidDocumentId) { 148 return lite_document_id; 149 } 150 return main_index_->last_added_document_id(); 151 } 152 153 // Sets last_added_document_id to document_id so long as document_id > 154 // last_added_document_id() set_last_added_document_id(DocumentId document_id)155 void set_last_added_document_id(DocumentId document_id) { 156 DocumentId lite_document_id = lite_index_->last_added_document_id(); 157 if (lite_document_id == kInvalidDocumentId || 158 document_id >= lite_document_id) { 159 lite_index_->set_last_added_document_id(document_id); 160 } 161 } 162 163 // Returns debug information for the index in out. 164 // verbosity = BASIC, simplest debug information - just the lexicons and lite 165 // index. 166 // verbosity = DETAILED, more detailed debug information including raw 167 // postings lists. GetDebugInfo(DebugInfoVerbosity::Code verbosity)168 IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const { 169 IndexDebugInfoProto debug_info; 170 *debug_info.mutable_index_storage_info() = GetStorageInfo(); 171 *debug_info.mutable_lite_index_info() = 172 lite_index_->GetDebugInfo(verbosity); 173 *debug_info.mutable_main_index_info() = 174 main_index_->GetDebugInfo(verbosity); 175 return debug_info; 176 } 177 PublishQueryStats(QueryStatsProto * query_stats)178 void PublishQueryStats(QueryStatsProto* query_stats) const { 179 query_stats->set_lite_index_hit_buffer_byte_size( 180 lite_index_->GetHitBufferByteSize()); 181 query_stats->set_lite_index_hit_buffer_unsorted_byte_size( 182 lite_index_->GetHitBufferUnsortedByteSize()); 183 } 184 185 // Returns the byte size of the all the elements held in the index. This 186 // excludes the size of any internal metadata of the index, e.g. the index's 187 // header. 188 // 189 // Returns: 190 // Byte size on success 191 // INTERNAL_ERROR on IO error GetElementsSize()192 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const { 193 ICING_ASSIGN_OR_RETURN(int64_t lite_index_size, 194 lite_index_->GetElementsSize()); 195 ICING_ASSIGN_OR_RETURN(int64_t main_index_size, 196 main_index_->GetElementsSize()); 197 return lite_index_size + main_index_size; 198 } 199 200 // Calculates the StorageInfo for the Index. 201 // 202 // If an IO error occurs while trying to calculate the value for a field, then 203 // that field will be set to -1. 204 IndexStorageInfoProto GetStorageInfo() const; 205 206 // Create an iterator to iterate through all doc hit infos in the index that 207 // match the term. term_start_index is the start index of the given term in 208 // the search query. unnormalized_term_length is the length of the given 209 // unnormalized term in the search query not listed in the mask. 210 // Eg. section_id_mask = 1U << 3; would only return hits that occur in 211 // section 3. 212 // 213 // Returns: 214 // unique ptr to a valid DocHitInfoIterator that matches the term 215 // INVALID_ARGUMENT if given an invalid term_match_type 216 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( 217 const std::string& term, int term_start_index, 218 int unnormalized_term_length, SectionIdMask section_id_mask, 219 TermMatchType::Code term_match_type, bool need_hit_term_frequency = true); 220 221 // Finds terms with the given prefix in the given namespaces. If 222 // 'namespace_ids' is empty, returns results from all the namespaces. Results 223 // are sorted in decreasing order of hit count. Number of results are no more 224 // than 'num_to_return'. 225 // 226 // Returns: 227 // A list of TermMetadata on success 228 // INTERNAL_ERROR if failed to access term data. 229 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( 230 const std::string& prefix, int num_to_return, 231 TermMatchType::Code scoring_match_type, 232 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by, 233 const SuggestionResultChecker* suggestion_result_checker); 234 235 // A class that can be used to add hits to the index. 236 // 237 // An editor groups hits from a particular section within a document together 238 // and dedupes hits for the same term within a section. This removes the 239 // burden of deduping from the caller and direct access to the index 240 // implementation allows for more efficient deduping. 241 class Editor { 242 public: 243 // Does not take any ownership, and all pointers must refer to valid objects 244 // that outlive the one constructed. 245 // TODO(b/141180665): Add nullptr checks for the raw pointers Editor(const TermIdCodec * term_id_codec,LiteIndex * lite_index,DocumentId document_id,SectionId section_id,TermMatchType::Code term_match_type,NamespaceId namespace_id)246 Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index, 247 DocumentId document_id, SectionId section_id, 248 TermMatchType::Code term_match_type, NamespaceId namespace_id) 249 : term_id_codec_(term_id_codec), 250 lite_index_(lite_index), 251 document_id_(document_id), 252 term_match_type_(term_match_type), 253 namespace_id_(namespace_id), 254 section_id_(section_id) {} 255 256 // Buffer the term in seen_tokens_. 257 libtextclassifier3::Status BufferTerm(const char* term); 258 // Index all the terms stored in seen_tokens_. 259 libtextclassifier3::Status IndexAllBufferedTerms(); 260 261 private: 262 // The Editor is able to store previously seen terms as TermIds. This is 263 // is more efficient than a client doing this externally because TermIds are 264 // not exposed to clients. 265 std::unordered_map<uint32_t, Hit::TermFrequency> seen_tokens_; 266 const TermIdCodec* term_id_codec_; 267 LiteIndex* lite_index_; 268 DocumentId document_id_; 269 TermMatchType::Code term_match_type_; 270 NamespaceId namespace_id_; 271 SectionId section_id_; 272 }; Edit(DocumentId document_id,SectionId section_id,TermMatchType::Code term_match_type,NamespaceId namespace_id)273 Editor Edit(DocumentId document_id, SectionId section_id, 274 TermMatchType::Code term_match_type, NamespaceId namespace_id) { 275 return Editor(term_id_codec_.get(), lite_index_.get(), document_id, 276 section_id, term_match_type, namespace_id); 277 } 278 WantsMerge()279 bool WantsMerge() const { return lite_index_->WantsMerge(); } 280 281 // Merges newly-added hits in the LiteIndex into the MainIndex. 282 // 283 // RETURNS: 284 // - INTERNAL on IO error while writing to the MainIndex. 285 // - RESOURCE_EXHAUSTED error if unable to grow the index. Merge()286 libtextclassifier3::Status Merge() { 287 ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs, 288 main_index_->MergeLexicon(lite_index_->lexicon())); 289 ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs, 290 MainIndexMerger::TranslateAndExpandLiteHits( 291 *lite_index_, *term_id_codec_, outputs)); 292 ICING_RETURN_IF_ERROR(main_index_->AddHits( 293 *term_id_codec_, std::move(outputs.backfill_map), 294 std::move(term_id_hit_pairs), lite_index_->last_added_document_id())); 295 ICING_RETURN_IF_ERROR(main_index_->PersistToDisk()); 296 return lite_index_->Reset(); 297 } 298 299 // Whether the LiteIndex HitBuffer requires sorting. This is only true if 300 // Icing has enabled sorting during indexing time, and the HitBuffer's 301 // unsorted tail has exceeded the lite_index_sort_size. LiteIndexNeedSort()302 bool LiteIndexNeedSort() const { 303 return options_.lite_index_sort_at_indexing && 304 lite_index_->HasUnsortedHitsExceedingSortThreshold(); 305 } 306 307 // Sorts the LiteIndex HitBuffer. SortLiteIndex()308 void SortLiteIndex() { lite_index_->SortHits(); } 309 310 // Reduces internal file sizes by reclaiming space of deleted documents. 311 // new_last_added_document_id will be used to update the last added document 312 // id in the lite index. 313 // 314 // Returns: 315 // OK on success 316 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 317 // invalid state and should be cleared. 318 libtextclassifier3::Status Optimize( 319 const std::vector<DocumentId>& document_id_old_to_new, 320 DocumentId new_last_added_document_id); 321 322 private: Index(const Options & options,std::unique_ptr<TermIdCodec> term_id_codec,std::unique_ptr<LiteIndex> lite_index,std::unique_ptr<MainIndex> main_index,const Filesystem * filesystem)323 Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec, 324 std::unique_ptr<LiteIndex> lite_index, 325 std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem) 326 : lite_index_(std::move(lite_index)), 327 main_index_(std::move(main_index)), 328 options_(options), 329 term_id_codec_(std::move(term_id_codec)), 330 filesystem_(filesystem) {} 331 332 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( 333 const std::string& prefix, 334 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by, 335 const SuggestionResultChecker* suggestion_result_checker); 336 337 std::unique_ptr<LiteIndex> lite_index_; 338 std::unique_ptr<MainIndex> main_index_; 339 const Options options_; 340 std::unique_ptr<TermIdCodec> term_id_codec_; 341 const Filesystem* filesystem_; 342 }; 343 344 } // namespace lib 345 } // namespace icing 346 347 #endif // ICING_INDEX_INDEX_H_ 348