• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_INDEX_H_
16 #define ICING_INDEX_INDEX_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <unordered_map>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/file/filesystem.h"
28 #include "icing/index/hit/hit.h"
29 #include "icing/index/iterator/doc-hit-info-iterator.h"
30 #include "icing/index/lite/lite-index.h"
31 #include "icing/index/lite/term-id-hit-pair.h"
32 #include "icing/index/main/main-index-merger.h"
33 #include "icing/index/main/main-index.h"
34 #include "icing/index/term-id-codec.h"
35 #include "icing/index/term-metadata.h"
36 #include "icing/legacy/index/icing-filesystem.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/logging.pb.h"
39 #include "icing/proto/scoring.pb.h"
40 #include "icing/proto/storage.pb.h"
41 #include "icing/proto/term.pb.h"
42 #include "icing/schema/section.h"
43 #include "icing/store/document-id.h"
44 #include "icing/store/namespace-id.h"
45 #include "icing/store/suggestion-result-checker.h"
46 #include "icing/util/status-macros.h"
47 
48 namespace icing {
49 namespace lib {
50 
51 // The class representing the Icing search index. This index maps terms to hits
52 // (document_ids, section_ids).
53 // Content is added to the index through the Editor class - which also dedupes
54 // hits (calling Editor::AddHit with the same arguments will only result in the
55 // creation of a single hit).
56 // Ex.
57 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
58 // .                Index::Create(MakeIndexOptions()));
59 // Index::Editor editor = index->Edit(document_id, section_id,
60 //     TermMatchType::EXACT_ONLY); ICING_RETURN_IF_ERROR(editor.AddHit("foo"));
61 // ICING_RETURN_IF_ERROR(editor.AddHit("baz"));
62 //
63 // Content is retrieved from the index through the Iterator class.
64 // Ex.
65 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
66 // .                Index::Create(MakeIndexOptions()));
67 // ICING_ASSIGN_OR_RETURN(Index::Iterator iterator =
68 //     index->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
69 // while(iterator->Advance().ok())
70 //   ProcessResult(iterator->value());
71 class Index {
72  public:
73   struct Options {
OptionsOptions74     explicit Options(const std::string& base_dir, uint32_t index_merge_size,
75                      bool lite_index_sort_at_indexing,
76                      uint32_t lite_index_sort_size)
77         : base_dir(base_dir),
78           index_merge_size(index_merge_size),
79           lite_index_sort_at_indexing(lite_index_sort_at_indexing),
80           lite_index_sort_size(lite_index_sort_size) {}
81 
82     std::string base_dir;
83     int32_t index_merge_size;
84     bool lite_index_sort_at_indexing;
85     int32_t lite_index_sort_size;
86   };
87 
88   // Creates an instance of Index in the directory pointed by file_dir.
89   //
90   // Returns:
91   //   Valid Index on success
92   //   DATA_LOSS if the index was corrupt and had to be cleared
93   //   INVALID_ARGUMENT if options have invalid values
94   //   INTERNAL on I/O error
95   static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create(
96       const Options& options, const Filesystem* filesystem,
97       const IcingFilesystem* icing_filesystem);
98 
99   // Reads magic from existing flash (main) index file header. We need this
100   // during Icing initialization phase to determine the version.
101   //
102   // Returns
103   //   Valid magic on success
104   //   NOT_FOUND if the lite index doesn't exist
105   //   INTERNAL on I/O error
106   static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
107       const Filesystem* filesystem, const std::string& base_dir);
108 
109   // Clears all files created by the index. Returns OK if all files were
110   // cleared.
Reset()111   libtextclassifier3::Status Reset() {
112     ICING_RETURN_IF_ERROR(lite_index_->Reset());
113     return main_index_->Reset();
114   }
115 
116   // Brings components of the index into memory in anticipation of a query in
117   // order to reduce latency.
Warm()118   void Warm() {
119     lite_index_->Warm();
120     main_index_->Warm();
121   }
122 
123   // Syncs all the data and metadata changes to disk.
124   //
125   // Returns:
126   //   OK on success
127   //   INTERNAL on I/O errors
PersistToDisk()128   libtextclassifier3::Status PersistToDisk() {
129     ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk());
130     return main_index_->PersistToDisk();
131   }
132 
133   // Discard parts of the index if they contain data for document ids greater
134   // than document_id.
135   //
136   // NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect.
137   //
138   // Returns:
139   //   OK on success
140   //   INTERNAL on I/O errors
141   libtextclassifier3::Status TruncateTo(DocumentId document_id);
142 
143   // DocumentIds are always inserted in increasing order. Returns the largest
144   // document_id added to the index.
last_added_document_id()145   DocumentId last_added_document_id() const {
146     DocumentId lite_document_id = lite_index_->last_added_document_id();
147     if (lite_document_id != kInvalidDocumentId) {
148       return lite_document_id;
149     }
150     return main_index_->last_added_document_id();
151   }
152 
153   // Sets last_added_document_id to document_id so long as document_id >
154   // last_added_document_id()
set_last_added_document_id(DocumentId document_id)155   void set_last_added_document_id(DocumentId document_id) {
156     DocumentId lite_document_id = lite_index_->last_added_document_id();
157     if (lite_document_id == kInvalidDocumentId ||
158         document_id >= lite_document_id) {
159       lite_index_->set_last_added_document_id(document_id);
160     }
161   }
162 
163   // Returns debug information for the index in out.
164   // verbosity = BASIC, simplest debug information - just the lexicons and lite
165   //                    index.
166   // verbosity = DETAILED, more detailed debug information including raw
167   //                       postings lists.
GetDebugInfo(DebugInfoVerbosity::Code verbosity)168   IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
169     IndexDebugInfoProto debug_info;
170     *debug_info.mutable_index_storage_info() = GetStorageInfo();
171     *debug_info.mutable_lite_index_info() =
172         lite_index_->GetDebugInfo(verbosity);
173     *debug_info.mutable_main_index_info() =
174         main_index_->GetDebugInfo(verbosity);
175     return debug_info;
176   }
177 
PublishQueryStats(QueryStatsProto * query_stats)178   void PublishQueryStats(QueryStatsProto* query_stats) const {
179     query_stats->set_lite_index_hit_buffer_byte_size(
180         lite_index_->GetHitBufferByteSize());
181     query_stats->set_lite_index_hit_buffer_unsorted_byte_size(
182         lite_index_->GetHitBufferUnsortedByteSize());
183   }
184 
185   // Returns the byte size of the all the elements held in the index. This
186   // excludes the size of any internal metadata of the index, e.g. the index's
187   // header.
188   //
189   // Returns:
190   //   Byte size on success
191   //   INTERNAL_ERROR on IO error
GetElementsSize()192   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
193     ICING_ASSIGN_OR_RETURN(int64_t lite_index_size,
194                            lite_index_->GetElementsSize());
195     ICING_ASSIGN_OR_RETURN(int64_t main_index_size,
196                            main_index_->GetElementsSize());
197     return lite_index_size + main_index_size;
198   }
199 
200   // Calculates the StorageInfo for the Index.
201   //
202   // If an IO error occurs while trying to calculate the value for a field, then
203   // that field will be set to -1.
204   IndexStorageInfoProto GetStorageInfo() const;
205 
206   // Create an iterator to iterate through all doc hit infos in the index that
207   // match the term. term_start_index is the start index of the given term in
208   // the search query. unnormalized_term_length is the length of the given
209   // unnormalized term in the search query not listed in the mask.
210   // Eg. section_id_mask = 1U << 3; would only return hits that occur in
211   // section 3.
212   //
213   // Returns:
214   //   unique ptr to a valid DocHitInfoIterator that matches the term
215   //   INVALID_ARGUMENT if given an invalid term_match_type
216   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
217       const std::string& term, int term_start_index,
218       int unnormalized_term_length, SectionIdMask section_id_mask,
219       TermMatchType::Code term_match_type, bool need_hit_term_frequency = true);
220 
221   // Finds terms with the given prefix in the given namespaces. If
222   // 'namespace_ids' is empty, returns results from all the namespaces. Results
223   // are sorted in decreasing order of hit count. Number of results are no more
224   // than 'num_to_return'.
225   //
226   // Returns:
227   //   A list of TermMetadata on success
228   //   INTERNAL_ERROR if failed to access term data.
229   libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
230       const std::string& prefix, int num_to_return,
231       TermMatchType::Code scoring_match_type,
232       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
233       const SuggestionResultChecker* suggestion_result_checker);
234 
235   // A class that can be used to add hits to the index.
236   //
237   // An editor groups hits from a particular section within a document together
238   // and dedupes hits for the same term within a section. This removes the
239   // burden of deduping from the caller and direct access to the index
240   // implementation allows for more efficient deduping.
241   class Editor {
242    public:
243     // Does not take any ownership, and all pointers must refer to valid objects
244     // that outlive the one constructed.
245     // TODO(b/141180665): Add nullptr checks for the raw pointers
Editor(const TermIdCodec * term_id_codec,LiteIndex * lite_index,DocumentId document_id,SectionId section_id,TermMatchType::Code term_match_type,NamespaceId namespace_id)246     Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index,
247            DocumentId document_id, SectionId section_id,
248            TermMatchType::Code term_match_type, NamespaceId namespace_id)
249         : term_id_codec_(term_id_codec),
250           lite_index_(lite_index),
251           document_id_(document_id),
252           term_match_type_(term_match_type),
253           namespace_id_(namespace_id),
254           section_id_(section_id) {}
255 
256     // Buffer the term in seen_tokens_.
257     libtextclassifier3::Status BufferTerm(const char* term);
258     // Index all the terms stored in seen_tokens_.
259     libtextclassifier3::Status IndexAllBufferedTerms();
260 
261    private:
262     // The Editor is able to store previously seen terms as TermIds. This is
263     // is more efficient than a client doing this externally because TermIds are
264     // not exposed to clients.
265     std::unordered_map<uint32_t, Hit::TermFrequency> seen_tokens_;
266     const TermIdCodec* term_id_codec_;
267     LiteIndex* lite_index_;
268     DocumentId document_id_;
269     TermMatchType::Code term_match_type_;
270     NamespaceId namespace_id_;
271     SectionId section_id_;
272   };
Edit(DocumentId document_id,SectionId section_id,TermMatchType::Code term_match_type,NamespaceId namespace_id)273   Editor Edit(DocumentId document_id, SectionId section_id,
274               TermMatchType::Code term_match_type, NamespaceId namespace_id) {
275     return Editor(term_id_codec_.get(), lite_index_.get(), document_id,
276                   section_id, term_match_type, namespace_id);
277   }
278 
WantsMerge()279   bool WantsMerge() const { return lite_index_->WantsMerge(); }
280 
281   // Merges newly-added hits in the LiteIndex into the MainIndex.
282   //
283   // RETURNS:
284   //  - INTERNAL on IO error while writing to the MainIndex.
285   //  - RESOURCE_EXHAUSTED error if unable to grow the index.
Merge()286   libtextclassifier3::Status Merge() {
287     ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
288                            main_index_->MergeLexicon(lite_index_->lexicon()));
289     ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
290                            MainIndexMerger::TranslateAndExpandLiteHits(
291                                *lite_index_, *term_id_codec_, outputs));
292     ICING_RETURN_IF_ERROR(main_index_->AddHits(
293         *term_id_codec_, std::move(outputs.backfill_map),
294         std::move(term_id_hit_pairs), lite_index_->last_added_document_id()));
295     ICING_RETURN_IF_ERROR(main_index_->PersistToDisk());
296     return lite_index_->Reset();
297   }
298 
299   // Whether the LiteIndex HitBuffer requires sorting. This is only true if
300   // Icing has enabled sorting during indexing time, and the HitBuffer's
301   // unsorted tail has exceeded the lite_index_sort_size.
LiteIndexNeedSort()302   bool LiteIndexNeedSort() const {
303     return options_.lite_index_sort_at_indexing &&
304            lite_index_->HasUnsortedHitsExceedingSortThreshold();
305   }
306 
307   // Sorts the LiteIndex HitBuffer.
SortLiteIndex()308   void SortLiteIndex() { lite_index_->SortHits(); }
309 
310   // Reduces internal file sizes by reclaiming space of deleted documents.
311   // new_last_added_document_id will be used to update the last added document
312   // id in the lite index.
313   //
314   // Returns:
315   //   OK on success
316   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
317   //                               invalid state and should be cleared.
318   libtextclassifier3::Status Optimize(
319       const std::vector<DocumentId>& document_id_old_to_new,
320       DocumentId new_last_added_document_id);
321 
322  private:
Index(const Options & options,std::unique_ptr<TermIdCodec> term_id_codec,std::unique_ptr<LiteIndex> lite_index,std::unique_ptr<MainIndex> main_index,const Filesystem * filesystem)323   Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
324         std::unique_ptr<LiteIndex> lite_index,
325         std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem)
326       : lite_index_(std::move(lite_index)),
327         main_index_(std::move(main_index)),
328         options_(options),
329         term_id_codec_(std::move(term_id_codec)),
330         filesystem_(filesystem) {}
331 
332   libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
333       const std::string& prefix,
334       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
335       const SuggestionResultChecker* suggestion_result_checker);
336 
337   std::unique_ptr<LiteIndex> lite_index_;
338   std::unique_ptr<MainIndex> main_index_;
339   const Options options_;
340   std::unique_ptr<TermIdCodec> term_id_codec_;
341   const Filesystem* filesystem_;
342 };
343 
344 }  // namespace lib
345 }  // namespace icing
346 
347 #endif  // ICING_INDEX_INDEX_H_
348