1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // A small index with continuous updates (doesn't need explicit Flush 16 // to persiste) but has more possibility for corruption. It can always 17 // detect corruption reliably. 18 19 #ifndef ICING_INDEX_LITE_INDEX_H_ 20 #define ICING_INDEX_LITE_INDEX_H_ 21 22 #include <cstdint> 23 #include <limits> 24 #include <memory> 25 #include <string> 26 #include <vector> 27 28 #include "icing/text_classifier/lib3/utils/base/status.h" 29 #include "icing/text_classifier/lib3/utils/base/statusor.h" 30 #include "icing/absl_ports/mutex.h" 31 #include "icing/absl_ports/thread_annotations.h" 32 #include "icing/file/filesystem.h" 33 #include "icing/index/hit/doc-hit-info.h" 34 #include "icing/index/hit/hit.h" 35 #include "icing/index/lite/lite-index-header.h" 36 #include "icing/index/lite/lite-index-options.h" 37 #include "icing/index/lite/term-id-hit-pair.h" 38 #include "icing/index/term-id-codec.h" 39 #include "icing/legacy/index/icing-array-storage.h" 40 #include "icing/legacy/index/icing-dynamic-trie.h" 41 #include "icing/legacy/index/icing-filesystem.h" 42 #include "icing/legacy/index/icing-mmapper.h" 43 #include "icing/proto/debug.pb.h" 44 #include "icing/proto/scoring.pb.h" 45 #include "icing/proto/storage.pb.h" 46 #include "icing/proto/term.pb.h" 47 #include "icing/schema/section.h" 48 #include "icing/store/document-id.h" 49 #include "icing/store/namespace-id.h" 50 #include "icing/store/suggestion-result-checker.h" 51 #include "icing/util/bit-util.h" 52 #include "icing/util/crc32.h" 53 54 namespace icing { 55 namespace lib { 56 57 // The LiteIndex is go/thread-compatible. Operations on the same data member 58 // object interfere with each other, unless they are guaranteed not to mutate 59 // the object (In the case of LiteIndex, this means all const methods, 60 // FetchHits and ScoreHits). 61 class LiteIndex { 62 public: 63 // An entry in the hit buffer. 64 using Options = LiteIndexOptions; 65 66 // Updates checksum of subcomponents. 67 ~LiteIndex(); 68 69 // Creates lite index from storage. The files will be created if they do not 70 // already exist. 71 // 72 // Returns: 73 // OK on success 74 // DATA_LOSS if the index was corrupted and cleared 75 // INTERNAL on I/O error 76 static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create( 77 const Options& options, const IcingFilesystem* filesystem); 78 79 // Resets all internal members of the index. Returns OK if all operations were 80 // successful. 81 libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_); 82 83 // Advises the OS to cache pages in the index, which will be accessed for a 84 // query soon. 85 void Warm() ICING_LOCKS_EXCLUDED(mutex_); 86 87 // Syncs all modified files in the index to disk. 88 // 89 // Returns: 90 // OK on success 91 // INTERNAL on I/O error 92 libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); 93 94 // Returns term_id if term found, NOT_FOUND otherwise. 95 libtextclassifier3::StatusOr<uint32_t> GetTermId( 96 const std::string& term) const ICING_LOCKS_EXCLUDED(mutex_); 97 98 // Returns an iterator for all terms for which 'prefix' is a prefix. 99 class PrefixIterator { 100 public: PrefixIterator(const IcingDynamicTrie::Iterator & delegate)101 explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate) 102 : delegate_(delegate) {} IsValid()103 bool IsValid() const { return delegate_.IsValid(); } 104 Advance()105 void Advance() { delegate_.Advance(); } 106 GetKey()107 const char* GetKey() const { return delegate_.GetKey(); } 108 GetValueIndex()109 uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); } 110 111 private: 112 IcingDynamicTrie::Iterator delegate_; 113 }; 114 115 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any 116 // previously returned PrefixIterator. FindTermPrefixes(const std::string & prefix)117 PrefixIterator FindTermPrefixes(const std::string& prefix) const 118 ICING_LOCKS_EXCLUDED(mutex_) { 119 absl_ports::shared_lock l(&mutex_); 120 return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str())); 121 } 122 123 // Inserts a term with its properties. 124 // 125 // Returns: 126 // A value index on success 127 // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available 128 libtextclassifier3::StatusOr<uint32_t> InsertTerm( 129 const std::string& term, TermMatchType::Code term_match_type, 130 NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_); 131 132 // Updates term properties by setting hasPrefixHits and namespace id of the 133 // term. 134 // 135 // Returns: 136 // OK on success 137 // RESOURCE_EXHAUSTED if no disk space is available 138 libtextclassifier3::Status UpdateTermProperties(uint32_t tvi, 139 bool hasPrefixHits, 140 NamespaceId namespace_id) 141 ICING_LOCKS_EXCLUDED(mutex_); 142 143 // Append hit to buffer. term_id must be encoded using the same term_id_codec 144 // supplied to the index constructor. 145 // RETURNS: 146 // - OK if hit was successfully added 147 // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer 148 // or file system capacity reached). 149 libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit) 150 ICING_LOCKS_EXCLUDED(mutex_); 151 152 // Add all hits with term_id from the sections specified in section_id_mask, 153 // skipping hits in non-prefix sections if only_from_prefix_sections is true, 154 // to hits_out. If hits_out is nullptr, no hits will be added. The 155 // corresponding hit term frequencies will also be added if term_frequency_out 156 // is nullptr. 157 // 158 // Only those hits which belongs to the given namespaces will be counted and 159 // fetched. A nullptr namespace checker will disable this check. 160 // 161 // Returns the score of hits that would be added to hits_out according the 162 // given score_by. 163 int FetchHits( 164 uint32_t term_id, SectionIdMask section_id_mask, 165 bool only_from_prefix_sections, 166 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 167 const SuggestionResultChecker* suggestion_result_checker, 168 std::vector<DocHitInfo>* hits_out, 169 std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr) 170 ICING_LOCKS_EXCLUDED(mutex_); 171 172 // Returns the hit count of the term. 173 // Only those hits which belongs to the given namespaces will be counted. 174 libtextclassifier3::StatusOr<int> ScoreHits( 175 uint32_t term_id, 176 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 177 const SuggestionResultChecker* suggestion_result_checker) 178 ICING_LOCKS_EXCLUDED(mutex_); 179 empty()180 bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; } 181 size()182 uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { 183 absl_ports::shared_lock l(&mutex_); 184 return sizeLocked(); 185 } 186 WantsMerge()187 bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { 188 absl_ports::shared_lock l(&mutex_); 189 return is_full() || sizeLocked() >= (options_.hit_buffer_want_merge_bytes / 190 sizeof(TermIdHitPair::Value)); 191 } 192 193 class const_iterator { 194 friend class LiteIndex; 195 196 public: 197 using iterator_category = std::forward_iterator_tag; 198 using value_type = TermIdHitPair; 199 using reference = const value_type&; 200 using pointer = const value_type*; 201 const_iterator()202 const_iterator() : const_iterator(nullptr, -1, -1) {} 203 204 reference operator*() const { return start_[position_]; } 205 206 pointer operator->() const { return start_ + position_; } 207 208 const_iterator& operator++() { 209 if (++position_ >= end_position_) { 210 start_ = nullptr; 211 position_ = -1; 212 end_position_ = -1; 213 } 214 return *this; 215 } 216 217 const_iterator operator++(int) { 218 auto tmp = *this; 219 ++*this; 220 return tmp; 221 } 222 223 bool operator!=(const const_iterator& rhs) { return !(*this == rhs); } 224 225 bool operator==(const const_iterator& rhs) { 226 return start_ == rhs.start_ && position_ == rhs.position_; 227 } 228 229 private: const_iterator(const TermIdHitPair * start,int position,int end_position)230 explicit const_iterator(const TermIdHitPair* start, int position, 231 int end_position) 232 : start_(start), position_(position), end_position_(end_position) {} 233 234 const TermIdHitPair* start_; 235 int position_; 236 int end_position_; 237 }; 238 begin()239 const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) { 240 absl_ports::shared_lock l(&mutex_); 241 // If the LiteIndex is empty, just return end(). 242 return empty_impl() 243 ? end() 244 : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0, 245 header_->cur_size()); 246 } 247 end()248 const_iterator end() const { return const_iterator(); } 249 max_hit_buffer_size()250 constexpr static uint32_t max_hit_buffer_size() { 251 return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair); 252 } 253 254 // We keep track of the last added document_id. This is always the largest 255 // document_id that has been added because hits can only be added in order of 256 // increasing document_id. last_added_document_id()257 DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) { 258 absl_ports::shared_lock l(&mutex_); 259 return header_->last_added_docid(); 260 } set_last_added_document_id(DocumentId document_id)261 void set_last_added_document_id(DocumentId document_id) 262 ICING_LOCKS_EXCLUDED(mutex_) { 263 absl_ports::unique_lock l(&mutex_); 264 header_->set_last_added_docid(document_id); 265 } 266 267 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference 268 // returned here. lexicon()269 const IcingDynamicTrie& lexicon() const { return lexicon_; } 270 271 // Returns debug information for the index in out. 272 // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer 273 // verbosity = DETAILED, more detailed debug information from the lexicon. 274 std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) 275 ICING_LOCKS_EXCLUDED(mutex_); 276 277 // Returns the byte size of all the elements held in the index. This excludes 278 // the size of any internal metadata of the index, e.g. the index's header. 279 // 280 // Returns: 281 // Byte size on success 282 // INTERNAL_ERROR on IO error 283 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const 284 ICING_LOCKS_EXCLUDED(mutex_); 285 286 // Takes the provided storage_info, populates the fields related to the lite 287 // index and returns that storage_info. 288 // 289 // If an IO error occurs while trying to calculate the value for a field, then 290 // that field will be set to -1. 291 IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const 292 ICING_LOCKS_EXCLUDED(mutex_); 293 294 // Reduces internal file sizes by reclaiming space of deleted documents. 295 // 296 // This method also sets the last_added_docid of the index to 297 // new_last_added_document_id. 298 // 299 // Returns: 300 // OK on success 301 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 302 // invalid state and should be cleared. 303 libtextclassifier3::Status Optimize( 304 const std::vector<DocumentId>& document_id_old_to_new, 305 const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) 306 ICING_LOCKS_EXCLUDED(mutex_); 307 308 private: 309 static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); 310 311 LiteIndex(const Options& options, const IcingFilesystem* filesystem); 312 313 // Initializes lite index from storage. Must be called exactly once after 314 // object construction. 315 // 316 // Returns: 317 // OK on success 318 // DATA_LOSS if the index was corrupted and cleared 319 // INTERNAL on I/O error 320 libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_); 321 initialized()322 bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 323 return header_ != nullptr; 324 } 325 326 // Check if the hit buffer has reached its capacity. 327 bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); 328 sizeLocked()329 uint32_t sizeLocked() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 330 return header_->cur_size(); 331 } 332 333 // Non-locking implementation for empty(). empty_impl()334 bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 335 return size_impl() == 0; 336 } 337 338 // Non-locking implementation for size(). size_impl()339 bool size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 340 return header_->cur_size(); 341 } 342 343 // Calculate the checksum of all sub-components of the LiteIndex 344 Crc32 ComputeChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 345 346 // Sets the computed checksum in the header 347 void UpdateChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 348 349 // Non-locking implementation for UpdateTermProperties. 350 libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi, 351 bool hasPrefixHits, 352 NamespaceId namespace_id) 353 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 354 355 // Whether or not the HitBuffer requires sorting. NeedSort()356 bool NeedSort() ICING_LOCKS_EXCLUDED(mutex_) { 357 absl_ports::shared_lock l(&mutex_); 358 return header_->cur_size() - header_->searchable_end() > 0; 359 } 360 361 // Sort hits stored in the index. 362 void SortHits() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 363 364 // Returns the position of the first element with term_id, or the searchable 365 // end of the hit buffer if term_id is not present. 366 uint32_t Seek(uint32_t term_id) const ICING_SHARED_LOCKS_REQUIRED(mutex_); 367 368 // File descriptor that points to where the header and hit buffer are written 369 // to. 370 ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_); 371 372 // Mmapped region past the header that stores the hits. 373 IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_); 374 375 // Crc checksum of the hits, excludes the header. 376 uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_); 377 378 // Trie that maps indexed terms to their term id 379 IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_); 380 381 // TODO(b/140437260): Port over to MemoryMappedFile 382 // Memory mapped region of the underlying file that reflects the header. 383 IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_); 384 385 // Wrapper around the mmapped header that contains stats on the lite index. 386 std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_); 387 388 // Options used to initialize the LiteIndex. 389 const Options options_; 390 391 // TODO(b/139087650) Move to icing::Filesystem 392 const IcingFilesystem* const filesystem_; 393 394 // Used to provide reader and writer locks 395 mutable absl_ports::shared_mutex mutex_; 396 }; 397 398 } // namespace lib 399 } // namespace icing 400 401 #endif // ICING_INDEX_LITE_INDEX_H_ 402