• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A small index with continuous updates (doesn't need explicit Flush
16 // to persiste) but has more possibility for corruption. It can always
17 // detect corruption reliably.
18 
19 #ifndef ICING_INDEX_LITE_INDEX_H_
20 #define ICING_INDEX_LITE_INDEX_H_
21 
22 #include <cstdint>
23 #include <iterator>
24 #include <limits>
25 #include <memory>
26 #include <string>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/mutex.h"
32 #include "icing/absl_ports/thread_annotations.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/index/hit/doc-hit-info.h"
35 #include "icing/index/hit/hit.h"
36 #include "icing/index/lite/lite-index-header.h"
37 #include "icing/index/lite/lite-index-options.h"
38 #include "icing/index/lite/term-id-hit-pair.h"
39 #include "icing/index/term-id-codec.h"
40 #include "icing/legacy/index/icing-array-storage.h"
41 #include "icing/legacy/index/icing-dynamic-trie.h"
42 #include "icing/legacy/index/icing-filesystem.h"
43 #include "icing/legacy/index/icing-mmapper.h"
44 #include "icing/proto/debug.pb.h"
45 #include "icing/proto/scoring.pb.h"
46 #include "icing/proto/storage.pb.h"
47 #include "icing/proto/term.pb.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-id.h"
50 #include "icing/store/namespace-id.h"
51 #include "icing/store/suggestion-result-checker.h"
52 #include "icing/util/crc32.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 // The LiteIndex is go/thread-compatible. Operations on the same data member
58 // object interfere with each other, unless they are guaranteed not to mutate
59 // the object (In the case of LiteIndex, this means all const methods,
60 // FetchHits and ScoreHits).
61 class LiteIndex {
62  public:
63   // An entry in the hit buffer.
64   using Options = LiteIndexOptions;
65 
66   // Offset for the LiteIndex_Header in the hit buffer mmap.
67   static constexpr uint32_t kHeaderFileOffset = 0;
68 
69   // Updates checksum of subcomponents.
70   ~LiteIndex();
71 
72   // Creates lite index from storage. The files will be created if they do not
73   // already exist.
74   //
75   // Returns:
76   //  OK on success
77   //  DATA_LOSS if the index was corrupted and cleared
78   //  INTERNAL on I/O error
79   static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
80       const Options& options, const IcingFilesystem* filesystem);
81 
82   // Resets all internal members of the index. Returns OK if all operations were
83   // successful.
84   libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_);
85 
86   // Advises the OS to cache pages in the index, which will be accessed for a
87   // query soon.
88   void Warm() ICING_LOCKS_EXCLUDED(mutex_);
89 
90   // Syncs all modified files in the index to disk.
91   //
92   // Returns:
93   //   OK on success
94   //   INTERNAL on I/O error
95   libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
96 
97   // Returns term_id if term found, NOT_FOUND otherwise.
98   libtextclassifier3::StatusOr<uint32_t> GetTermId(
99       const std::string& term) const ICING_LOCKS_EXCLUDED(mutex_);
100 
101   // Returns an iterator for all terms for which 'prefix' is a prefix.
102   class PrefixIterator {
103    public:
PrefixIterator(const IcingDynamicTrie::Iterator & delegate)104     explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
105         : delegate_(delegate) {}
IsValid()106     bool IsValid() const { return delegate_.IsValid(); }
107 
Advance()108     void Advance() { delegate_.Advance(); }
109 
GetKey()110     const char* GetKey() const { return delegate_.GetKey(); }
111 
GetValueIndex()112     uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
113 
114    private:
115     IcingDynamicTrie::Iterator delegate_;
116   };
117 
118   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any
119   // previously returned PrefixIterator.
FindTermPrefixes(const std::string & prefix)120   PrefixIterator FindTermPrefixes(const std::string& prefix) const
121       ICING_LOCKS_EXCLUDED(mutex_) {
122     absl_ports::shared_lock l(&mutex_);
123     return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
124   }
125 
126   // Inserts a term with its properties.
127   //
128   // Returns:
129   //   A value index on success
130   //   RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
131   libtextclassifier3::StatusOr<uint32_t> InsertTerm(
132       const std::string& term, TermMatchType::Code term_match_type,
133       NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_);
134 
135   // Updates term properties by setting hasPrefixHits and namespace id of the
136   // term.
137   //
138   // Returns:
139   //   OK on success
140   //   RESOURCE_EXHAUSTED if no disk space is available
141   libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
142                                                   bool hasPrefixHits,
143                                                   NamespaceId namespace_id)
144       ICING_LOCKS_EXCLUDED(mutex_);
145 
146   // Append hit to buffer. term_id must be encoded using the same term_id_codec
147   // supplied to the index constructor.
148   // RETURNS:
149   //  - OK if hit was successfully added
150   //  - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
151   //    or file system capacity reached).
152   libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit)
153       ICING_LOCKS_EXCLUDED(mutex_);
154 
155   // Add all hits with term_id from the sections specified in section_id_mask,
156   // skipping hits in non-prefix sections if only_from_prefix_sections is true,
157   // to hits_out. If hits_out is nullptr, no hits will be added. The
158   // corresponding hit term frequencies will also not be added if
159   // term_frequency_out is nullptr.
160   //
161   // Only those hits which belongs to the given namespaces will be counted and
162   // fetched. A nullptr namespace checker will disable this check.
163   //
164   // Returns the score of hits that would be added to hits_out according the
165   // given score_by.
166   int FetchHits(
167       uint32_t term_id, SectionIdMask section_id_mask,
168       bool only_from_prefix_sections,
169       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
170       const SuggestionResultChecker* suggestion_result_checker,
171       std::vector<DocHitInfo>* hits_out,
172       std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr)
173       ICING_LOCKS_EXCLUDED(mutex_);
174 
175   // Returns the hit count of the term.
176   // Only those hits which belongs to the given namespaces will be counted.
177   libtextclassifier3::StatusOr<int> ScoreHits(
178       uint32_t term_id,
179       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
180       const SuggestionResultChecker* suggestion_result_checker)
181       ICING_LOCKS_EXCLUDED(mutex_);
182 
empty()183   bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; }
184 
size()185   uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) {
186     absl_ports::shared_lock l(&mutex_);
187     return size_impl();
188   }
189 
WantsMerge()190   bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) {
191     absl_ports::shared_lock l(&mutex_);
192     return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes /
193                                         sizeof(TermIdHitPair::Value));
194   }
195 
196   // Whether or not the HitBuffer's unsorted tail size exceeds the sort
197   // threshold.
HasUnsortedHitsExceedingSortThreshold()198   bool HasUnsortedHitsExceedingSortThreshold() const
199       ICING_LOCKS_EXCLUDED(mutex_) {
200     absl_ports::shared_lock l(&mutex_);
201     return HasUnsortedHitsExceedingSortThresholdImpl();
202   }
203 
204   // Sort hits stored in the index.
SortHits()205   void SortHits() ICING_LOCKS_EXCLUDED(mutex_) {
206     absl_ports::unique_lock l(&mutex_);
207     SortHitsImpl();
208   };
209 
210   class const_iterator {
211     friend class LiteIndex;
212 
213    public:
214     using iterator_category = std::forward_iterator_tag;
215     using value_type = TermIdHitPair;
216     using reference = const value_type&;
217     using pointer = const value_type*;
218 
const_iterator()219     const_iterator() : const_iterator(nullptr, -1, -1) {}
220 
221     reference operator*() const { return start_[position_]; }
222 
223     pointer operator->() const { return start_ + position_; }
224 
225     const_iterator& operator++() {
226       if (++position_ >= end_position_) {
227         start_ = nullptr;
228         position_ = -1;
229         end_position_ = -1;
230       }
231       return *this;
232     }
233 
234     const_iterator operator++(int) {
235       auto tmp = *this;
236       ++*this;
237       return tmp;
238     }
239 
240     bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
241 
242     bool operator==(const const_iterator& rhs) {
243       return start_ == rhs.start_ && position_ == rhs.position_;
244     }
245 
246    private:
const_iterator(const TermIdHitPair * start,int position,int end_position)247     explicit const_iterator(const TermIdHitPair* start, int position,
248                             int end_position)
249         : start_(start), position_(position), end_position_(end_position) {}
250 
251     const TermIdHitPair* start_;
252     int position_;
253     int end_position_;
254   };
255 
begin()256   const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) {
257     absl_ports::shared_lock l(&mutex_);
258     // If the LiteIndex is empty, just return end().
259     return empty_impl()
260                ? end()
261                : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
262                                 header_->cur_size());
263   }
264 
end()265   const_iterator end() const { return const_iterator(); }
266 
max_hit_buffer_size()267   constexpr static uint32_t max_hit_buffer_size() {
268     return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
269   }
270 
271   // We keep track of the last added document_id. This is always the largest
272   // document_id that has been added because hits can only be added in order of
273   // increasing document_id.
last_added_document_id()274   DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) {
275     absl_ports::shared_lock l(&mutex_);
276     return header_->last_added_docid();
277   }
set_last_added_document_id(DocumentId document_id)278   void set_last_added_document_id(DocumentId document_id)
279       ICING_LOCKS_EXCLUDED(mutex_) {
280     absl_ports::unique_lock l(&mutex_);
281     header_->set_last_added_docid(document_id);
282   }
283 
284   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference
285   // returned here.
lexicon()286   const IcingDynamicTrie& lexicon() const { return lexicon_; }
287 
288   // Returns debug information for the index in out.
289   // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer
290   // verbosity = DETAILED, more detailed debug information from the lexicon.
291   std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity)
292       ICING_LOCKS_EXCLUDED(mutex_);
293 
294   // Returns the byte size of all the elements held in the index. This excludes
295   // the size of any internal metadata of the index, e.g. the index's header.
296   //
297   // Returns:
298   //   Byte size on success
299   //   INTERNAL_ERROR on IO error
300   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const
301       ICING_LOCKS_EXCLUDED(mutex_);
302 
303   // Takes the provided storage_info, populates the fields related to the lite
304   // index and returns that storage_info.
305   //
306   // If an IO error occurs while trying to calculate the value for a field, then
307   // that field will be set to -1.
308   IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const
309       ICING_LOCKS_EXCLUDED(mutex_);
310 
311   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferByteSize()312   uint32_t GetHitBufferByteSize() const ICING_LOCKS_EXCLUDED(mutex_) {
313     absl_ports::shared_lock l(&mutex_);
314     return size_impl() * sizeof(TermIdHitPair::Value);
315   }
316 
317   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferUnsortedSize()318   uint32_t GetHitBufferUnsortedSize() const ICING_LOCKS_EXCLUDED(mutex_) {
319     absl_ports::shared_lock l(&mutex_);
320     return GetHitBufferUnsortedSizeImpl();
321   }
322 
323   // Returns the byte size of unsorted part of hit_buffer_.
GetHitBufferUnsortedByteSize()324   uint64_t GetHitBufferUnsortedByteSize() const ICING_LOCKS_EXCLUDED(mutex_) {
325     absl_ports::shared_lock l(&mutex_);
326     return GetHitBufferUnsortedSizeImpl() * sizeof(TermIdHitPair::Value);
327   }
328 
329   // Reduces internal file sizes by reclaiming space of deleted documents.
330   //
331   // This method also sets the last_added_docid of the index to
332   // new_last_added_document_id.
333   //
334   // Returns:
335   //   OK on success
336   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
337   //                               invalid state and should be cleared.
338   libtextclassifier3::Status Optimize(
339       const std::vector<DocumentId>& document_id_old_to_new,
340       const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id)
341       ICING_LOCKS_EXCLUDED(mutex_);
342 
343  private:
344   static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
345 
346   LiteIndex(const Options& options, const IcingFilesystem* filesystem);
347 
348   // Initializes lite index from storage. Must be called exactly once after
349   // object construction.
350   //
351   // Returns:
352   //  OK on success
353   //  DATA_LOSS if the index was corrupted and cleared
354   //  INTERNAL on I/O error
355   libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_);
356 
initialized()357   bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
358     return header_ != nullptr;
359   }
360 
361   // Check if the hit buffer has reached its capacity.
362   bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_);
363 
364   // Non-locking implementation for empty().
empty_impl()365   bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
366     return size_impl() == 0;
367   }
368 
369   // Non-locking implementation for size().
size_impl()370   uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
371     return header_->cur_size();
372   }
373 
374   // Calculate the checksum of all sub-components of the LiteIndex
375   Crc32 ComputeChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
376 
377   // Sets the computed checksum in the header
378   void UpdateChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
379 
380   // Non-locking implementation for UpdateTermProperties.
381   libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi,
382                                                       bool hasPrefixHits,
383                                                       NamespaceId namespace_id)
384       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
385 
386   // We need to sort during querying time when:
387   // 1. Sorting at indexing time is not enabled and there is an unsorted tail
388   //    section in the HitBuffer.
389   // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless
390   //    of whether or not hit_buffer_sort_at_indexing is enabled. This is to
391   //    prevent performing sequential search on a large unsorted tail section,
392   //    which would result in bad query performance.
393   //    This is more of a sanity check. We should not really be encountering
394   //    this case.
NeedSortAtQuerying()395   bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
396     return HasUnsortedHitsExceedingSortThresholdImpl() ||
397            (!options_.hit_buffer_sort_at_indexing &&
398             GetHitBufferUnsortedSizeImpl() > 0);
399   }
400 
401   // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl().
HasUnsortedHitsExceedingSortThresholdImpl()402   bool HasUnsortedHitsExceedingSortThresholdImpl() const
403       ICING_SHARED_LOCKS_REQUIRED(mutex_) {
404     return GetHitBufferUnsortedSizeImpl() >=
405            (options_.hit_buffer_sort_threshold_bytes /
406             sizeof(TermIdHitPair::Value));
407   }
408 
409   // Non-locking implementation for SortHits().
410   void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
411 
412   // Calculates and adds the score for a fetched hit to total_score_out, while
413   // updating last_document_id (which keeps track of the last added docId so
414   // far), and is_last_document_desired (which keeps track of whether that last
415   // added docId belongs to the query's desired namespace.)
416   //
417   // Also appends the hit to hits_out and term_frequency_out if the vectors are
418   // not null.
419   void ScoreAndAppendFetchedHit(
420       const Hit& hit, SectionIdMask section_id_mask,
421       bool only_from_prefix_sections,
422       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
423       const SuggestionResultChecker* suggestion_result_checker,
424       DocumentId& last_document_id, bool& is_last_document_desired,
425       int& total_score_out, std::vector<DocHitInfo>* hits_out,
426       std::vector<Hit::TermFrequencyArray>* term_frequency_out) const
427       ICING_SHARED_LOCKS_REQUIRED(mutex_);
428 
429   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferUnsortedSizeImpl()430   uint32_t GetHitBufferUnsortedSizeImpl() const
431       ICING_SHARED_LOCKS_REQUIRED(mutex_) {
432     return header_->cur_size() - header_->searchable_end();
433   }
434 
435   // File descriptor that points to where the header and hit buffer are written
436   // to.
437   ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_);
438 
439   // Mmapped region past the header that stores the hits.
440   IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_);
441 
442   // Crc checksum of the hits, excludes the header.
443   uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_);
444 
445   // Trie that maps indexed terms to their term id
446   IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_);
447 
448   // TODO(b/140437260): Port over to MemoryMappedFile
449   // Memory mapped region of the underlying file that reflects the header.
450   IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_);
451 
452   // Wrapper around the mmapped header that contains stats on the lite index.
453   std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_);
454 
455   // Options used to initialize the LiteIndex.
456   const Options options_;
457 
458   // TODO(b/139087650) Move to icing::Filesystem
459   const IcingFilesystem* const filesystem_;
460 
461   // Used to provide reader and writer locks
462   mutable absl_ports::shared_mutex mutex_;
463 };
464 
465 }  // namespace lib
466 }  // namespace icing
467 
468 #endif  // ICING_INDEX_LITE_INDEX_H_
469