1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // A small index with continuous updates (doesn't need explicit Flush 16 // to persiste) but has more possibility for corruption. It can always 17 // detect corruption reliably. 18 19 #ifndef ICING_INDEX_LITE_INDEX_H_ 20 #define ICING_INDEX_LITE_INDEX_H_ 21 22 #include <cstdint> 23 #include <iterator> 24 #include <limits> 25 #include <memory> 26 #include <string> 27 #include <vector> 28 29 #include "icing/text_classifier/lib3/utils/base/status.h" 30 #include "icing/text_classifier/lib3/utils/base/statusor.h" 31 #include "icing/absl_ports/mutex.h" 32 #include "icing/absl_ports/thread_annotations.h" 33 #include "icing/file/filesystem.h" 34 #include "icing/index/hit/doc-hit-info.h" 35 #include "icing/index/hit/hit.h" 36 #include "icing/index/lite/lite-index-header.h" 37 #include "icing/index/lite/lite-index-options.h" 38 #include "icing/index/lite/term-id-hit-pair.h" 39 #include "icing/index/term-id-codec.h" 40 #include "icing/legacy/index/icing-array-storage.h" 41 #include "icing/legacy/index/icing-dynamic-trie.h" 42 #include "icing/legacy/index/icing-filesystem.h" 43 #include "icing/legacy/index/icing-mmapper.h" 44 #include "icing/proto/debug.pb.h" 45 #include "icing/proto/scoring.pb.h" 46 #include "icing/proto/storage.pb.h" 47 #include "icing/proto/term.pb.h" 48 #include "icing/schema/section.h" 49 #include "icing/store/document-id.h" 50 #include "icing/store/namespace-id.h" 51 #include "icing/store/suggestion-result-checker.h" 52 #include "icing/util/crc32.h" 53 54 namespace icing { 55 namespace lib { 56 57 // The LiteIndex is go/thread-compatible. Operations on the same data member 58 // object interfere with each other, unless they are guaranteed not to mutate 59 // the object (In the case of LiteIndex, this means all const methods, 60 // FetchHits and ScoreHits). 61 class LiteIndex { 62 public: 63 // An entry in the hit buffer. 64 using Options = LiteIndexOptions; 65 66 // Offset for the LiteIndex_Header in the hit buffer mmap. 67 static constexpr uint32_t kHeaderFileOffset = 0; 68 69 // Updates checksum of subcomponents. 70 ~LiteIndex(); 71 72 // Creates lite index from storage. The files will be created if they do not 73 // already exist. 74 // 75 // Returns: 76 // OK on success 77 // DATA_LOSS if the index was corrupted and cleared 78 // INTERNAL on I/O error 79 static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create( 80 const Options& options, const IcingFilesystem* filesystem); 81 82 // Resets all internal members of the index. Returns OK if all operations were 83 // successful. 84 libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_); 85 86 // Advises the OS to cache pages in the index, which will be accessed for a 87 // query soon. 88 void Warm() ICING_LOCKS_EXCLUDED(mutex_); 89 90 // Syncs all modified files in the index to disk. 91 // 92 // Returns: 93 // OK on success 94 // INTERNAL on I/O error 95 libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); 96 97 // Returns term_id if term found, NOT_FOUND otherwise. 98 libtextclassifier3::StatusOr<uint32_t> GetTermId( 99 const std::string& term) const ICING_LOCKS_EXCLUDED(mutex_); 100 101 // Returns an iterator for all terms for which 'prefix' is a prefix. 102 class PrefixIterator { 103 public: PrefixIterator(const IcingDynamicTrie::Iterator & delegate)104 explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate) 105 : delegate_(delegate) {} IsValid()106 bool IsValid() const { return delegate_.IsValid(); } 107 Advance()108 void Advance() { delegate_.Advance(); } 109 GetKey()110 const char* GetKey() const { return delegate_.GetKey(); } 111 GetValueIndex()112 uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); } 113 114 private: 115 IcingDynamicTrie::Iterator delegate_; 116 }; 117 118 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any 119 // previously returned PrefixIterator. FindTermPrefixes(const std::string & prefix)120 PrefixIterator FindTermPrefixes(const std::string& prefix) const 121 ICING_LOCKS_EXCLUDED(mutex_) { 122 absl_ports::shared_lock l(&mutex_); 123 return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str())); 124 } 125 126 // Inserts a term with its properties. 127 // 128 // Returns: 129 // A value index on success 130 // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available 131 libtextclassifier3::StatusOr<uint32_t> InsertTerm( 132 const std::string& term, TermMatchType::Code term_match_type, 133 NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_); 134 135 // Updates term properties by setting hasPrefixHits and namespace id of the 136 // term. 137 // 138 // Returns: 139 // OK on success 140 // RESOURCE_EXHAUSTED if no disk space is available 141 libtextclassifier3::Status UpdateTermProperties(uint32_t tvi, 142 bool hasPrefixHits, 143 NamespaceId namespace_id) 144 ICING_LOCKS_EXCLUDED(mutex_); 145 146 // Append hit to buffer. term_id must be encoded using the same term_id_codec 147 // supplied to the index constructor. 148 // RETURNS: 149 // - OK if hit was successfully added 150 // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer 151 // or file system capacity reached). 152 libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit) 153 ICING_LOCKS_EXCLUDED(mutex_); 154 155 // Add all hits with term_id from the sections specified in section_id_mask, 156 // skipping hits in non-prefix sections if only_from_prefix_sections is true, 157 // to hits_out. If hits_out is nullptr, no hits will be added. The 158 // corresponding hit term frequencies will also not be added if 159 // term_frequency_out is nullptr. 160 // 161 // Only those hits which belongs to the given namespaces will be counted and 162 // fetched. A nullptr namespace checker will disable this check. 163 // 164 // Returns the score of hits that would be added to hits_out according the 165 // given score_by. 166 int FetchHits( 167 uint32_t term_id, SectionIdMask section_id_mask, 168 bool only_from_prefix_sections, 169 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 170 const SuggestionResultChecker* suggestion_result_checker, 171 std::vector<DocHitInfo>* hits_out, 172 std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr) 173 ICING_LOCKS_EXCLUDED(mutex_); 174 175 // Returns the hit count of the term. 176 // Only those hits which belongs to the given namespaces will be counted. 177 libtextclassifier3::StatusOr<int> ScoreHits( 178 uint32_t term_id, 179 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 180 const SuggestionResultChecker* suggestion_result_checker) 181 ICING_LOCKS_EXCLUDED(mutex_); 182 empty()183 bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; } 184 size()185 uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { 186 absl_ports::shared_lock l(&mutex_); 187 return size_impl(); 188 } 189 WantsMerge()190 bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { 191 absl_ports::shared_lock l(&mutex_); 192 return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes / 193 sizeof(TermIdHitPair::Value)); 194 } 195 196 // Whether or not the HitBuffer's unsorted tail size exceeds the sort 197 // threshold. HasUnsortedHitsExceedingSortThreshold()198 bool HasUnsortedHitsExceedingSortThreshold() const 199 ICING_LOCKS_EXCLUDED(mutex_) { 200 absl_ports::shared_lock l(&mutex_); 201 return HasUnsortedHitsExceedingSortThresholdImpl(); 202 } 203 204 // Sort hits stored in the index. SortHits()205 void SortHits() ICING_LOCKS_EXCLUDED(mutex_) { 206 absl_ports::unique_lock l(&mutex_); 207 SortHitsImpl(); 208 }; 209 210 class const_iterator { 211 friend class LiteIndex; 212 213 public: 214 using iterator_category = std::forward_iterator_tag; 215 using value_type = TermIdHitPair; 216 using reference = const value_type&; 217 using pointer = const value_type*; 218 const_iterator()219 const_iterator() : const_iterator(nullptr, -1, -1) {} 220 221 reference operator*() const { return start_[position_]; } 222 223 pointer operator->() const { return start_ + position_; } 224 225 const_iterator& operator++() { 226 if (++position_ >= end_position_) { 227 start_ = nullptr; 228 position_ = -1; 229 end_position_ = -1; 230 } 231 return *this; 232 } 233 234 const_iterator operator++(int) { 235 auto tmp = *this; 236 ++*this; 237 return tmp; 238 } 239 240 bool operator!=(const const_iterator& rhs) { return !(*this == rhs); } 241 242 bool operator==(const const_iterator& rhs) { 243 return start_ == rhs.start_ && position_ == rhs.position_; 244 } 245 246 private: const_iterator(const TermIdHitPair * start,int position,int end_position)247 explicit const_iterator(const TermIdHitPair* start, int position, 248 int end_position) 249 : start_(start), position_(position), end_position_(end_position) {} 250 251 const TermIdHitPair* start_; 252 int position_; 253 int end_position_; 254 }; 255 begin()256 const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) { 257 absl_ports::shared_lock l(&mutex_); 258 // If the LiteIndex is empty, just return end(). 259 return empty_impl() 260 ? end() 261 : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0, 262 header_->cur_size()); 263 } 264 end()265 const_iterator end() const { return const_iterator(); } 266 max_hit_buffer_size()267 constexpr static uint32_t max_hit_buffer_size() { 268 return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair); 269 } 270 271 // We keep track of the last added document_id. This is always the largest 272 // document_id that has been added because hits can only be added in order of 273 // increasing document_id. last_added_document_id()274 DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) { 275 absl_ports::shared_lock l(&mutex_); 276 return header_->last_added_docid(); 277 } set_last_added_document_id(DocumentId document_id)278 void set_last_added_document_id(DocumentId document_id) 279 ICING_LOCKS_EXCLUDED(mutex_) { 280 absl_ports::unique_lock l(&mutex_); 281 header_->set_last_added_docid(document_id); 282 } 283 284 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference 285 // returned here. lexicon()286 const IcingDynamicTrie& lexicon() const { return lexicon_; } 287 288 // Returns debug information for the index in out. 289 // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer 290 // verbosity = DETAILED, more detailed debug information from the lexicon. 291 std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) 292 ICING_LOCKS_EXCLUDED(mutex_); 293 294 // Returns the byte size of all the elements held in the index. This excludes 295 // the size of any internal metadata of the index, e.g. the index's header. 296 // 297 // Returns: 298 // Byte size on success 299 // INTERNAL_ERROR on IO error 300 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const 301 ICING_LOCKS_EXCLUDED(mutex_); 302 303 // Takes the provided storage_info, populates the fields related to the lite 304 // index and returns that storage_info. 305 // 306 // If an IO error occurs while trying to calculate the value for a field, then 307 // that field will be set to -1. 308 IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const 309 ICING_LOCKS_EXCLUDED(mutex_); 310 311 // Returns the size of unsorted part of hit_buffer_. GetHitBufferByteSize()312 uint32_t GetHitBufferByteSize() const ICING_LOCKS_EXCLUDED(mutex_) { 313 absl_ports::shared_lock l(&mutex_); 314 return size_impl() * sizeof(TermIdHitPair::Value); 315 } 316 317 // Returns the size of unsorted part of hit_buffer_. GetHitBufferUnsortedSize()318 uint32_t GetHitBufferUnsortedSize() const ICING_LOCKS_EXCLUDED(mutex_) { 319 absl_ports::shared_lock l(&mutex_); 320 return GetHitBufferUnsortedSizeImpl(); 321 } 322 323 // Returns the byte size of unsorted part of hit_buffer_. GetHitBufferUnsortedByteSize()324 uint64_t GetHitBufferUnsortedByteSize() const ICING_LOCKS_EXCLUDED(mutex_) { 325 absl_ports::shared_lock l(&mutex_); 326 return GetHitBufferUnsortedSizeImpl() * sizeof(TermIdHitPair::Value); 327 } 328 329 // Reduces internal file sizes by reclaiming space of deleted documents. 330 // 331 // This method also sets the last_added_docid of the index to 332 // new_last_added_document_id. 333 // 334 // Returns: 335 // OK on success 336 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 337 // invalid state and should be cleared. 338 libtextclassifier3::Status Optimize( 339 const std::vector<DocumentId>& document_id_old_to_new, 340 const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) 341 ICING_LOCKS_EXCLUDED(mutex_); 342 343 private: 344 static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); 345 346 LiteIndex(const Options& options, const IcingFilesystem* filesystem); 347 348 // Initializes lite index from storage. Must be called exactly once after 349 // object construction. 350 // 351 // Returns: 352 // OK on success 353 // DATA_LOSS if the index was corrupted and cleared 354 // INTERNAL on I/O error 355 libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_); 356 initialized()357 bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 358 return header_ != nullptr; 359 } 360 361 // Check if the hit buffer has reached its capacity. 362 bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); 363 364 // Non-locking implementation for empty(). empty_impl()365 bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 366 return size_impl() == 0; 367 } 368 369 // Non-locking implementation for size(). size_impl()370 uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 371 return header_->cur_size(); 372 } 373 374 // Calculate the checksum of all sub-components of the LiteIndex 375 Crc32 ComputeChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 376 377 // Sets the computed checksum in the header 378 void UpdateChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 379 380 // Non-locking implementation for UpdateTermProperties. 381 libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi, 382 bool hasPrefixHits, 383 NamespaceId namespace_id) 384 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 385 386 // We need to sort during querying time when: 387 // 1. Sorting at indexing time is not enabled and there is an unsorted tail 388 // section in the HitBuffer. 389 // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless 390 // of whether or not hit_buffer_sort_at_indexing is enabled. This is to 391 // prevent performing sequential search on a large unsorted tail section, 392 // which would result in bad query performance. 393 // This is more of a sanity check. We should not really be encountering 394 // this case. NeedSortAtQuerying()395 bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 396 return HasUnsortedHitsExceedingSortThresholdImpl() || 397 (!options_.hit_buffer_sort_at_indexing && 398 GetHitBufferUnsortedSizeImpl() > 0); 399 } 400 401 // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl(). HasUnsortedHitsExceedingSortThresholdImpl()402 bool HasUnsortedHitsExceedingSortThresholdImpl() const 403 ICING_SHARED_LOCKS_REQUIRED(mutex_) { 404 return GetHitBufferUnsortedSizeImpl() >= 405 (options_.hit_buffer_sort_threshold_bytes / 406 sizeof(TermIdHitPair::Value)); 407 } 408 409 // Non-locking implementation for SortHits(). 410 void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 411 412 // Calculates and adds the score for a fetched hit to total_score_out, while 413 // updating last_document_id (which keeps track of the last added docId so 414 // far), and is_last_document_desired (which keeps track of whether that last 415 // added docId belongs to the query's desired namespace.) 416 // 417 // Also appends the hit to hits_out and term_frequency_out if the vectors are 418 // not null. 419 void ScoreAndAppendFetchedHit( 420 const Hit& hit, SectionIdMask section_id_mask, 421 bool only_from_prefix_sections, 422 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 423 const SuggestionResultChecker* suggestion_result_checker, 424 DocumentId& last_document_id, bool& is_last_document_desired, 425 int& total_score_out, std::vector<DocHitInfo>* hits_out, 426 std::vector<Hit::TermFrequencyArray>* term_frequency_out) const 427 ICING_SHARED_LOCKS_REQUIRED(mutex_); 428 429 // Returns the size of unsorted part of hit_buffer_. GetHitBufferUnsortedSizeImpl()430 uint32_t GetHitBufferUnsortedSizeImpl() const 431 ICING_SHARED_LOCKS_REQUIRED(mutex_) { 432 return header_->cur_size() - header_->searchable_end(); 433 } 434 435 // File descriptor that points to where the header and hit buffer are written 436 // to. 437 ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_); 438 439 // Mmapped region past the header that stores the hits. 440 IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_); 441 442 // Crc checksum of the hits, excludes the header. 443 uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_); 444 445 // Trie that maps indexed terms to their term id 446 IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_); 447 448 // TODO(b/140437260): Port over to MemoryMappedFile 449 // Memory mapped region of the underlying file that reflects the header. 450 IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_); 451 452 // Wrapper around the mmapped header that contains stats on the lite index. 453 std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_); 454 455 // Options used to initialize the LiteIndex. 456 const Options options_; 457 458 // TODO(b/139087650) Move to icing::Filesystem 459 const IcingFilesystem* const filesystem_; 460 461 // Used to provide reader and writer locks 462 mutable absl_ports::shared_mutex mutex_; 463 }; 464 465 } // namespace lib 466 } // namespace icing 467 468 #endif // ICING_INDEX_LITE_INDEX_H_ 469