• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A small index with continuous updates (doesn't need explicit Flush
16 // to persiste) but has more possibility for corruption. It can always
17 // detect corruption reliably.
18 
19 #ifndef ICING_INDEX_LITE_INDEX_H_
20 #define ICING_INDEX_LITE_INDEX_H_
21 
22 #include <cstdint>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <vector>
27 
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/absl_ports/mutex.h"
31 #include "icing/absl_ports/thread_annotations.h"
32 #include "icing/file/filesystem.h"
33 #include "icing/index/hit/doc-hit-info.h"
34 #include "icing/index/hit/hit.h"
35 #include "icing/index/lite/lite-index-header.h"
36 #include "icing/index/lite/lite-index-options.h"
37 #include "icing/index/lite/term-id-hit-pair.h"
38 #include "icing/index/term-id-codec.h"
39 #include "icing/legacy/index/icing-array-storage.h"
40 #include "icing/legacy/index/icing-dynamic-trie.h"
41 #include "icing/legacy/index/icing-filesystem.h"
42 #include "icing/legacy/index/icing-mmapper.h"
43 #include "icing/proto/debug.pb.h"
44 #include "icing/proto/scoring.pb.h"
45 #include "icing/proto/storage.pb.h"
46 #include "icing/proto/term.pb.h"
47 #include "icing/schema/section.h"
48 #include "icing/store/document-id.h"
49 #include "icing/store/namespace-id.h"
50 #include "icing/store/suggestion-result-checker.h"
51 #include "icing/util/bit-util.h"
52 #include "icing/util/crc32.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 // The LiteIndex is go/thread-compatible. Operations on the same data member
58 // object interfere with each other, unless they are guaranteed not to mutate
59 // the object (In the case of LiteIndex, this means all const methods,
60 // FetchHits and ScoreHits).
61 class LiteIndex {
62  public:
63   // An entry in the hit buffer.
64   using Options = LiteIndexOptions;
65 
66   // Updates checksum of subcomponents.
67   ~LiteIndex();
68 
69   // Creates lite index from storage. The files will be created if they do not
70   // already exist.
71   //
72   // Returns:
73   //  OK on success
74   //  DATA_LOSS if the index was corrupted and cleared
75   //  INTERNAL on I/O error
76   static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
77       const Options& options, const IcingFilesystem* filesystem);
78 
79   // Resets all internal members of the index. Returns OK if all operations were
80   // successful.
81   libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_);
82 
83   // Advises the OS to cache pages in the index, which will be accessed for a
84   // query soon.
85   void Warm() ICING_LOCKS_EXCLUDED(mutex_);
86 
87   // Syncs all modified files in the index to disk.
88   //
89   // Returns:
90   //   OK on success
91   //   INTERNAL on I/O error
92   libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
93 
94   // Returns term_id if term found, NOT_FOUND otherwise.
95   libtextclassifier3::StatusOr<uint32_t> GetTermId(
96       const std::string& term) const ICING_LOCKS_EXCLUDED(mutex_);
97 
98   // Returns an iterator for all terms for which 'prefix' is a prefix.
99   class PrefixIterator {
100    public:
PrefixIterator(const IcingDynamicTrie::Iterator & delegate)101     explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
102         : delegate_(delegate) {}
IsValid()103     bool IsValid() const { return delegate_.IsValid(); }
104 
Advance()105     void Advance() { delegate_.Advance(); }
106 
GetKey()107     const char* GetKey() const { return delegate_.GetKey(); }
108 
GetValueIndex()109     uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
110 
111    private:
112     IcingDynamicTrie::Iterator delegate_;
113   };
114 
115   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any
116   // previously returned PrefixIterator.
FindTermPrefixes(const std::string & prefix)117   PrefixIterator FindTermPrefixes(const std::string& prefix) const
118       ICING_LOCKS_EXCLUDED(mutex_) {
119     absl_ports::shared_lock l(&mutex_);
120     return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
121   }
122 
123   // Inserts a term with its properties.
124   //
125   // Returns:
126   //   A value index on success
127   //   RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
128   libtextclassifier3::StatusOr<uint32_t> InsertTerm(
129       const std::string& term, TermMatchType::Code term_match_type,
130       NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_);
131 
132   // Updates term properties by setting hasPrefixHits and namespace id of the
133   // term.
134   //
135   // Returns:
136   //   OK on success
137   //   RESOURCE_EXHAUSTED if no disk space is available
138   libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
139                                                   bool hasPrefixHits,
140                                                   NamespaceId namespace_id)
141       ICING_LOCKS_EXCLUDED(mutex_);
142 
143   // Append hit to buffer. term_id must be encoded using the same term_id_codec
144   // supplied to the index constructor.
145   // RETURNS:
146   //  - OK if hit was successfully added
147   //  - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
148   //    or file system capacity reached).
149   libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit)
150       ICING_LOCKS_EXCLUDED(mutex_);
151 
152   // Add all hits with term_id from the sections specified in section_id_mask,
153   // skipping hits in non-prefix sections if only_from_prefix_sections is true,
154   // to hits_out. If hits_out is nullptr, no hits will be added. The
155   // corresponding hit term frequencies will also be added if term_frequency_out
156   // is nullptr.
157   //
158   // Only those hits which belongs to the given namespaces will be counted and
159   // fetched. A nullptr namespace checker will disable this check.
160   //
161   // Returns the score of hits that would be added to hits_out according the
162   // given score_by.
163   int FetchHits(
164       uint32_t term_id, SectionIdMask section_id_mask,
165       bool only_from_prefix_sections,
166       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
167       const SuggestionResultChecker* suggestion_result_checker,
168       std::vector<DocHitInfo>* hits_out,
169       std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr)
170       ICING_LOCKS_EXCLUDED(mutex_);
171 
172   // Returns the hit count of the term.
173   // Only those hits which belongs to the given namespaces will be counted.
174   libtextclassifier3::StatusOr<int> ScoreHits(
175       uint32_t term_id,
176       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
177       const SuggestionResultChecker* suggestion_result_checker)
178       ICING_LOCKS_EXCLUDED(mutex_);
179 
empty()180   bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; }
181 
size()182   uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) {
183     absl_ports::shared_lock l(&mutex_);
184     return sizeLocked();
185   }
186 
WantsMerge()187   bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) {
188     absl_ports::shared_lock l(&mutex_);
189     return is_full() || sizeLocked() >= (options_.hit_buffer_want_merge_bytes /
190                         sizeof(TermIdHitPair::Value));
191   }
192 
193   class const_iterator {
194     friend class LiteIndex;
195 
196    public:
197     using iterator_category = std::forward_iterator_tag;
198     using value_type = TermIdHitPair;
199     using reference = const value_type&;
200     using pointer = const value_type*;
201 
const_iterator()202     const_iterator() : const_iterator(nullptr, -1, -1) {}
203 
204     reference operator*() const { return start_[position_]; }
205 
206     pointer operator->() const { return start_ + position_; }
207 
208     const_iterator& operator++() {
209       if (++position_ >= end_position_) {
210         start_ = nullptr;
211         position_ = -1;
212         end_position_ = -1;
213       }
214       return *this;
215     }
216 
217     const_iterator operator++(int) {
218       auto tmp = *this;
219       ++*this;
220       return tmp;
221     }
222 
223     bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
224 
225     bool operator==(const const_iterator& rhs) {
226       return start_ == rhs.start_ && position_ == rhs.position_;
227     }
228 
229    private:
const_iterator(const TermIdHitPair * start,int position,int end_position)230     explicit const_iterator(const TermIdHitPair* start, int position,
231                             int end_position)
232         : start_(start), position_(position), end_position_(end_position) {}
233 
234     const TermIdHitPair* start_;
235     int position_;
236     int end_position_;
237   };
238 
begin()239   const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) {
240     absl_ports::shared_lock l(&mutex_);
241     // If the LiteIndex is empty, just return end().
242     return empty_impl()
243                ? end()
244                : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
245                                 header_->cur_size());
246   }
247 
end()248   const_iterator end() const { return const_iterator(); }
249 
max_hit_buffer_size()250   constexpr static uint32_t max_hit_buffer_size() {
251     return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
252   }
253 
254   // We keep track of the last added document_id. This is always the largest
255   // document_id that has been added because hits can only be added in order of
256   // increasing document_id.
last_added_document_id()257   DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) {
258     absl_ports::shared_lock l(&mutex_);
259     return header_->last_added_docid();
260   }
set_last_added_document_id(DocumentId document_id)261   void set_last_added_document_id(DocumentId document_id)
262       ICING_LOCKS_EXCLUDED(mutex_) {
263     absl_ports::unique_lock l(&mutex_);
264     header_->set_last_added_docid(document_id);
265   }
266 
267   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference
268   // returned here.
lexicon()269   const IcingDynamicTrie& lexicon() const { return lexicon_; }
270 
271   // Returns debug information for the index in out.
272   // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer
273   // verbosity = DETAILED, more detailed debug information from the lexicon.
274   std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity)
275       ICING_LOCKS_EXCLUDED(mutex_);
276 
277   // Returns the byte size of all the elements held in the index. This excludes
278   // the size of any internal metadata of the index, e.g. the index's header.
279   //
280   // Returns:
281   //   Byte size on success
282   //   INTERNAL_ERROR on IO error
283   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const
284       ICING_LOCKS_EXCLUDED(mutex_);
285 
286   // Takes the provided storage_info, populates the fields related to the lite
287   // index and returns that storage_info.
288   //
289   // If an IO error occurs while trying to calculate the value for a field, then
290   // that field will be set to -1.
291   IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const
292       ICING_LOCKS_EXCLUDED(mutex_);
293 
294   // Reduces internal file sizes by reclaiming space of deleted documents.
295   //
296   // This method also sets the last_added_docid of the index to
297   // new_last_added_document_id.
298   //
299   // Returns:
300   //   OK on success
301   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
302   //                               invalid state and should be cleared.
303   libtextclassifier3::Status Optimize(
304       const std::vector<DocumentId>& document_id_old_to_new,
305       const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id)
306       ICING_LOCKS_EXCLUDED(mutex_);
307 
308  private:
309   static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
310 
311   LiteIndex(const Options& options, const IcingFilesystem* filesystem);
312 
313   // Initializes lite index from storage. Must be called exactly once after
314   // object construction.
315   //
316   // Returns:
317   //  OK on success
318   //  DATA_LOSS if the index was corrupted and cleared
319   //  INTERNAL on I/O error
320   libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_);
321 
initialized()322   bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
323     return header_ != nullptr;
324   }
325 
326   // Check if the hit buffer has reached its capacity.
327   bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_);
328 
sizeLocked()329   uint32_t sizeLocked() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
330     return header_->cur_size();
331   }
332 
333   // Non-locking implementation for empty().
empty_impl()334   bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
335     return size_impl() == 0;
336   }
337 
338   // Non-locking implementation for size().
size_impl()339   bool size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
340     return header_->cur_size();
341   }
342 
343   // Calculate the checksum of all sub-components of the LiteIndex
344   Crc32 ComputeChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
345 
346   // Sets the computed checksum in the header
347   void UpdateChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
348 
349   // Non-locking implementation for UpdateTermProperties.
350   libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi,
351                                                       bool hasPrefixHits,
352                                                       NamespaceId namespace_id)
353       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
354 
355   // Whether or not the HitBuffer requires sorting.
NeedSort()356   bool NeedSort() ICING_LOCKS_EXCLUDED(mutex_) {
357     absl_ports::shared_lock l(&mutex_);
358     return header_->cur_size() - header_->searchable_end() > 0;
359   }
360 
361   // Sort hits stored in the index.
362   void SortHits() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
363 
364   // Returns the position of the first element with term_id, or the searchable
365   // end of the hit buffer if term_id is not present.
366   uint32_t Seek(uint32_t term_id) const ICING_SHARED_LOCKS_REQUIRED(mutex_);
367 
368   // File descriptor that points to where the header and hit buffer are written
369   // to.
370   ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_);
371 
372   // Mmapped region past the header that stores the hits.
373   IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_);
374 
375   // Crc checksum of the hits, excludes the header.
376   uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_);
377 
378   // Trie that maps indexed terms to their term id
379   IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_);
380 
381   // TODO(b/140437260): Port over to MemoryMappedFile
382   // Memory mapped region of the underlying file that reflects the header.
383   IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_);
384 
385   // Wrapper around the mmapped header that contains stats on the lite index.
386   std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_);
387 
388   // Options used to initialize the LiteIndex.
389   const Options options_;
390 
391   // TODO(b/139087650) Move to icing::Filesystem
392   const IcingFilesystem* const filesystem_;
393 
394   // Used to provide reader and writer locks
395   mutable absl_ports::shared_mutex mutex_;
396 };
397 
398 }  // namespace lib
399 }  // namespace icing
400 
401 #endif  // ICING_INDEX_LITE_INDEX_H_
402